# Install and Import Necessary Libraries and Packages
import pandas as pd
import pandas_datareader as pdr
import pandas_datareader.data as web
import numpy as np
import yfinance as yf
import matplotlib.pyplot as plt
import ta as ta
import requests
import os
from bs4 import BeautifulSoup
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from scipy import stats
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects import conversion
from scipy.stats import skew, kurtosis
pandas2ri.activate()
Before diving into the core research project, it's valuable to take a comprehensive look at the ta package. This package offers a wide array of built-in technical indicators, which can serve as powerful tools for developing and testing trading strategies.
However, understanding the mathematical foundations and the various parameters available within the ta package is crucial to tailoring these indicators effectively to fit specific research objectives. While ta offers a robust set of tools, there will undoubtedly be cases where further customization is required to meet more complex research needs. This is where creating custom technical indicators based on specific financial criteria becomes essential.
By combining the built-in tools with personalized adjustments, it becomes possible to address nuanced research questions and trading strategies that go beyond standard implementations. Nevertheless, exploring and experimenting with different strategies using the capabilities of ta is not only feasible but also provides a solid foundation for generating insights and enhancing decision-making.
ta package.Statistical Analysis:
Visualizations:
Feature Distributions:
The journey so far has been very insightful, revealing that traditional regression and volatility modeling may not always yield the best results in complex financial datasets. However, the lessons learned from this exploration will guide my next steps toward building a more robust quantitative strategy.
# Understanding ta library more comprehensively
# Uncomment code to run
# print(dir(ta))
# Determine submodules of interest
# Uncomment code to run
# for submodule in dir(ta):
# if not submodule.startswith("__"):
# print(submodule)
# List of Trend-based Technical Indicators and how to customize arguments and parameters
# Remove # to learn more
# help(ta.trend)
# List of Volatility-based Technical Indicators and how to customize arguments and parameters
# Remove # to learn more
# help(ta.volatility)
# List of Momentum Based Technical Indicators and how to customize arguments and parameters
# Remove # to learn more
# help(ta.momentum)
# Extensive list of Volume-based Technical Indicators and how to customize arguments and parameters
# Remove # to learn more
# help(ta.volume)
# Popular Technical Indicators bundled together
# Remove # to learn more
# help(ta.wrapper)
# Remove # to learn more
# help(ta.add_others_ta)
# Remove # to learn more
# help(ta.add_trend_ta)
# Remove # to learn more
# help(ta.add_volatility_ta)
# Remove # to learn more
# help(ta.add_volume_ta)
# Remove # to learn more
# help(ta.others)
# Remove # to learn more
# help(ta.utils)
# In depth analysis of ta package and submodules is done
# Initially accessed list of tickers for S&P 500 stocks using slickcharts
# Used headers to simulate a request from a web browser
# Create a function that grabs all S&P 500 Tickers
# Creating this function in anticipation of using it to extract data easily from yfinance
def sp_500_tickers():
# Grab url
url = 'https://www.slickcharts.com/sp500'
# Simulate a request from a web browser
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
#Error Handling
try:
response = requests.get(url, headers=headers)
response.raise_for_status() # Ensure the request was successful
except requests.exceptions.HTTPError as http_err:
print("HTTP Error has occurred")
return []
except Exception as err:
print("Error has occurred")
return []
# Parse the HTML content of the webpage
html_parser = BeautifulSoup(response.text, 'html.parser')
# Scrape html file using inspect and determine how to extract table
# Table was labeled 'table table-hover table-borderless table-sm'
table = html_parser.find('table', {'class': 'table table-hover table-borderless table-sm'})
if table is None:
print("Could not find the table on the webpage.")
return []
tickers = []
ticker_column_index = 2 # The index of the column containing the ticker symbol
# Go through all table rows
for row in table.find_all('tr')[1:]: # Skip the header row
cells = row.find_all('td') # Extract the columns of each row
if len(cells) > ticker_column_index:
ticker = cells[ticker_column_index].text.strip()
tickers.append(ticker)
return tickers
tickers = sp_500_tickers()
print(f"Number of tickers fetched: {len(tickers)}")
Number of tickers fetched: 503
# Remove # to learn more
# help(yf.Ticker)
def validate_tickers(tickers): # Function to validate the tickers by fetching minimal data using yfinance
valid_tickers = []
for ticker in tickers:
# Ran into problem here
# Determined the root cause was the syntax carried over from Wikipedia was not compatible with yfinance
# Made sure that data has been read in wikipedia and yfinance together and data is not empty for any ticker
yf_ticker = ticker.replace('.', '-')
stock = yf.Ticker(yf_ticker)
hist = stock.history(period="1m")
if not hist.empty:
valid_tickers.append(yf_ticker) # Append the ticker, not the data
return valid_tickers
# Validate the tickers
valid_tickers = validate_tickers(tickers)
print(f"Number of valid tickers: {len(valid_tickers)}")
Number of valid tickers: 503
def fetch_historical_data(valid_tickers, start_date='2009-01-01', end_date='2024-01-01'): # Function to fetch historical data for validated tickers using yfinance
all_data = {} # Initialize all_data dictionary
for ticker in valid_tickers:
stock = yf.Ticker(ticker)
try:
# Fetch historical market data for the specified period
hist = stock.history(start=start_date, end=end_date)
if not hist.empty:
all_data[ticker] = hist
else:
print(f"No data found for {ticker}.")
except Exception as e:
print(f"Error fetching historical data for {ticker}: {e}")
return all_data
# Fetch full historical data for the validated tickers
historical_data = fetch_historical_data(valid_tickers)
print(f"Number of tickers with historical data: {len(historical_data)}")
GEV: Data doesn't exist for startDate = 1230786000, endDate = 1704085200
No data found for GEV.
SW: Data doesn't exist for startDate = 1230786000, endDate = 1704085200
No data found for SW.
SOLV: Data doesn't exist for startDate = 1230786000, endDate = 1704085200
No data found for SOLV. Number of tickers with historical data: 500
# Select several stocks for visualization to make sure data has been extracted as desired
selected_tickers = ['AAPL', 'MSFT', 'GOOGL'] # Selected stock tickers for visualization
plt.figure(figsize=(14, 8))
for ticker in selected_tickers:
if ticker in historical_data:
plt.plot(historical_data[ticker].index, historical_data[ticker]['Close'], label=ticker)
plt.title('Historical Close Prices of Selected Stocks from Slick')
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.legend()
plt.grid(True)
plt.show()
def sp_500_tickers_wiki(): # Function to scrape S&P 500 tickers from Wikipedia
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies' # Extracted the same data from Wikipedia to ensure consistency and accuracy from external websites
response = requests.get(url)
response.raise_for_status()
# Parse the HTML content of the webpage
html_parser = BeautifulSoup(response.text, 'html.parser')
# Scrape HTML file using inspect and determine how to extract table
table = html_parser.find('table', {'id': 'constituents'})
# Verify if the table was found
if table is None:
print("Could not find the table on the webpage.")
return []
# List to store the extracted ticker symbols
tickers = []
ticker_column_index = 0 # The index of the column containing the ticker symbol
# Go through all table rows
for row in table.find_all('tr')[1:]: # Skip the header row
cells = row.find_all('td') # Extract the columns of each row
if len(cells) > ticker_column_index:
ticker = cells[ticker_column_index].text.strip()
tickers.append(ticker)
return tickers
# Example usage, and testing to make sure the process works
tickers_wiki = sp_500_tickers_wiki()
print(f"Number of tickers fetched: {len(tickers)}")
# Function to validate the tickers by fetching minimal data using yfinance
def validate_tickers_wiki(tickers_wiki):
valid_tickers = []
for ticker in tickers_wiki:
yf_ticker = ticker.replace('.', '-')
stock = yf.Ticker(yf_ticker)
hist = stock.history(period="1m")
if not hist.empty:
valid_tickers.append(yf_ticker) # Append the ticker, not the data
return valid_tickers
# Validate the tickers
valid_tickers_wiki = validate_tickers_wiki(tickers_wiki)
print(f"Number of valid tickers: {len(valid_tickers_wiki)}")
# Function to fetch historical data for validated tickers using yfinance
def fetch_historical_data_wiki(valid_tickers_wiki, start_date='2009-01-01', end_date='2024-01-01'):
all_data = {} # Initialize all_data dictionary
for ticker in valid_tickers_wiki:
stock = yf.Ticker(ticker)
try:
# Fetch historical market data for the specified period
hist = stock.history(start=start_date, end=end_date)
if not hist.empty:
all_data[ticker] = hist
else:
print(f"No data found for {ticker}.")
except Exception as e:
print(f"Error fetching historical data for {ticker}: {e}")
return all_data
# Fetch full historical data for the validated tickers
historical_data_wiki = fetch_historical_data_wiki(valid_tickers_wiki)
print(f"Number of tickers with historical data: {len(historical_data_wiki)}")
# Select several stocks for visualization to make sure data has been extracted as desired
selected_tickers_wiki = ['AAPL', 'MSFT', 'GOOGL'] # Selected stock tickers for visualization
plt.figure(figsize=(14, 8))
for ticker in selected_tickers_wiki:
if ticker in historical_data_wiki:
plt.plot(historical_data_wiki[ticker].index, historical_data_wiki[ticker]['Close'], label=ticker)
plt.title('Historical Close Prices of Selected Stocks from Wiki')
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.legend()
plt.grid(True)
plt.show()
Number of tickers fetched: 503 Number of valid tickers: 503
GEV: Data doesn't exist for startDate = 1230786000, endDate = 1704085200
No data found for GEV.
SW: Data doesn't exist for startDate = 1230786000, endDate = 1704085200
No data found for SW.
SOLV: Data doesn't exist for startDate = 1230786000, endDate = 1704085200
No data found for SOLV. Number of tickers with historical data: 500
def apply_all_indicators(data):
# Ensure necessary columns are present
if 'Open' in data.columns and 'High' in data.columns and 'Low' in data.columns and 'Close' in data.columns and 'Volume' in data.columns:
# Add all technical indicators using ta
data = ta.add_all_ta_features(
data, open="Open", high="High", low="Low", close="Close", volume="Volume", fillna=True)
else:
print("Required columns are missing from the data.")
return data
def fetch_and_enhance_data(tickers, start_date, end_date):
# Initialize dictionary to store the historical data
all_data = {}
# Fetch data for each ticker
for ticker in tickers:
try:
stock = yf.Ticker(ticker)
historical_data = stock.history(start=start_date, end=end_date)
if not historical_data.empty:
# Apply all technical indicators
all_data[ticker] = apply_all_indicators(historical_data)
except Exception as e:
print(f"Error processing {ticker}: {e}")
return all_data
# Define your tickers and time periods
tickers = ['AAPL', 'MSFT', 'GOOGL'] # Example subset for testing
start_date = '2010-01-01'
end_date = '2023-12-29'
# Fetch and enhance data
enhanced_data = fetch_and_enhance_data(tickers, start_date, end_date)
# Looking at documentation, others_ta indicator others_cr is calculated only using Close price, our response, and results in a perfect correlation between others_cr and Close
# This could potentially ruin the model because of perfect multicollinearity.
enhanced_data['AAPL'] = enhanced_data['AAPL'].drop(columns=['others_cr'])
# Now you can inspect or save the enhanced data
if 'AAPL' in enhanced_data:
print(enhanced_data['AAPL'].head()) # Display some of the data
Open High Low Close Volume \
Date
2010-01-04 00:00:00-05:00 6.437013 6.469284 6.405345 6.454506 493729600
2010-01-05 00:00:00-05:00 6.472299 6.502157 6.431583 6.465664 601904800
2010-01-06 00:00:00-05:00 6.465665 6.491301 6.356185 6.362820 552160000
2010-01-07 00:00:00-05:00 6.386344 6.393885 6.304913 6.351057 477131200
2010-01-08 00:00:00-05:00 6.342612 6.393884 6.305214 6.393280 447610800
Dividends Stock Splits volume_adi volume_obv \
Date
2010-01-04 00:00:00-05:00 0.0 0.0 2.654966e+08 493729600
2010-01-05 00:00:00-05:00 0.0 0.0 2.449217e+08 1095634400
2010-01-06 00:00:00-05:00 0.0 0.0 -2.530072e+08 543474400
2010-01-07 00:00:00-05:00 0.0 0.0 -2.352198e+08 66343200
2010-01-08 00:00:00-05:00 0.0 0.0 2.062968e+08 513954000
volume_cmf ... momentum_roc momentum_ppo \
Date ...
2010-01-04 00:00:00-05:00 0.537737 ... 0.0 0.000000
2010-01-05 00:00:00-05:00 0.223543 ... 0.0 0.013789
2010-01-06 00:00:00-05:00 -0.153543 ... 0.0 -0.102764
2010-01-07 00:00:00-05:00 -0.110696 ... 0.0 -0.207705
2010-01-08 00:00:00-05:00 0.080192 ... 0.0 -0.235305
momentum_ppo_signal momentum_ppo_hist \
Date
2010-01-04 00:00:00-05:00 0.000000 0.000000
2010-01-05 00:00:00-05:00 0.002758 0.011031
2010-01-06 00:00:00-05:00 -0.018347 -0.084418
2010-01-07 00:00:00-05:00 -0.056218 -0.151487
2010-01-08 00:00:00-05:00 -0.092036 -0.143270
momentum_pvo momentum_pvo_signal \
Date
2010-01-04 00:00:00-05:00 0.000000 0.000000
2010-01-05 00:00:00-05:00 1.719878 0.343976
2010-01-06 00:00:00-05:00 2.240194 0.723219
2010-01-07 00:00:00-05:00 1.454246 0.869425
2010-01-08 00:00:00-05:00 0.349630 0.765466
momentum_pvo_hist momentum_kama others_dr \
Date
2010-01-04 00:00:00-05:00 0.000000 6.454506 0.000000
2010-01-05 00:00:00-05:00 1.375902 6.459657 0.172871
2010-01-06 00:00:00-05:00 1.516975 6.415663 -1.590614
2010-01-07 00:00:00-05:00 0.584821 6.386017 -0.184872
2010-01-08 00:00:00-05:00 -0.415836 6.389281 0.664818
others_dlr
Date
2010-01-04 00:00:00-05:00 0.000000
2010-01-05 00:00:00-05:00 0.172722
2010-01-06 00:00:00-05:00 -1.603400
2010-01-07 00:00:00-05:00 -0.185043
2010-01-08 00:00:00-05:00 0.662618
[5 rows x 92 columns]
print(enhanced_data['AAPL'].columns)
Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits',
'volume_adi', 'volume_obv', 'volume_cmf', 'volume_fi', 'volume_em',
'volume_sma_em', 'volume_vpt', 'volume_vwap', 'volume_mfi',
'volume_nvi', 'volatility_bbm', 'volatility_bbh', 'volatility_bbl',
'volatility_bbw', 'volatility_bbp', 'volatility_bbhi',
'volatility_bbli', 'volatility_kcc', 'volatility_kch', 'volatility_kcl',
'volatility_kcw', 'volatility_kcp', 'volatility_kchi',
'volatility_kcli', 'volatility_dcl', 'volatility_dch', 'volatility_dcm',
'volatility_dcw', 'volatility_dcp', 'volatility_atr', 'volatility_ui',
'trend_macd', 'trend_macd_signal', 'trend_macd_diff', 'trend_sma_fast',
'trend_sma_slow', 'trend_ema_fast', 'trend_ema_slow',
'trend_vortex_ind_pos', 'trend_vortex_ind_neg', 'trend_vortex_ind_diff',
'trend_trix', 'trend_mass_index', 'trend_dpo', 'trend_kst',
'trend_kst_sig', 'trend_kst_diff', 'trend_ichimoku_conv',
'trend_ichimoku_base', 'trend_ichimoku_a', 'trend_ichimoku_b',
'trend_stc', 'trend_adx', 'trend_adx_pos', 'trend_adx_neg', 'trend_cci',
'trend_visual_ichimoku_a', 'trend_visual_ichimoku_b', 'trend_aroon_up',
'trend_aroon_down', 'trend_aroon_ind', 'trend_psar_up',
'trend_psar_down', 'trend_psar_up_indicator',
'trend_psar_down_indicator', 'momentum_rsi', 'momentum_stoch_rsi',
'momentum_stoch_rsi_k', 'momentum_stoch_rsi_d', 'momentum_tsi',
'momentum_uo', 'momentum_stoch', 'momentum_stoch_signal', 'momentum_wr',
'momentum_ao', 'momentum_roc', 'momentum_ppo', 'momentum_ppo_signal',
'momentum_ppo_hist', 'momentum_pvo', 'momentum_pvo_signal',
'momentum_pvo_hist', 'momentum_kama', 'others_dr', 'others_dlr'],
dtype='object')
# Grab the specific stock of interest
aapl_data = enhanced_data['AAPL']
# Create a Date column
aapl_data = aapl_data.reset_index()
# Check for any missing or null values
print(aapl_data.isnull().sum())
# Convert 'volume_obv' to a larger data type, like float64 because computer cannot store current type
aapl_data['volume_obv'] = aapl_data['volume_obv'].astype(np.float64)
# Now convert to R
aapl_r_df = pandas2ri.py2rpy(aapl_data)
# Assign df to the R environment for further use
ro.globalenv['aapl_r_df'] = aapl_r_df
# Set up R for complex statistical analysis
ro.r('''
library(randomForest) # For nonlinear relationships
library(MASS) # For LTS, stepAIC
library(car) # Companion to Applied Regression
library(forecast) # Time Series forecasting
library(tseries)
library(lmtest) # For regression diagnostics
library(ggplot2) # For visualization
library(data.table) # For fast data manipulation
library(dplyr) # For data manipulation
library(boot) # For bootstrap
library(glmnet) # For ridge/lasso regression
library(quantreg) # For quantile regression
library(leaps) # For model selection, regsubsets
library(robustbase) # For robust linear regression
library(caret) # For cross-validation and model training
library(MCMCpack) # For MCMC regression
library(mgcv) # For generalized additive model
''')
Date 0
Open 0
High 0
Low 0
Close 0
..
momentum_pvo_signal 0
momentum_pvo_hist 0
momentum_kama 0
others_dr 0
others_dlr 0
Length: 93, dtype: int64
# Ensure that we are operating on the desired data in R
ro.r('''
summary <- summary(aapl_r_df)
print(summary)
''')
Date Open High Open High
Min. :2010-01-04 00:00:00.00 Min. : 5.802 Min. : 5.911
1st Qu.:2013-07-03 00:00:00.00 1st Qu.: 17.001 1st Qu.: 17.136
Median :2016-12-29 00:00:00.00 Median : 29.101 Median : 29.291
Mean :2016-12-30 04:53:46.80 Mean : 57.621 Mean : 58.244
3rd Qu.:2020-06-30 00:00:00.00 3rd Qu.: 88.869 3rd Qu.: 89.937
Max. :2023-12-28 00:00:00.00 Max. :197.271 Max. :198.865
Low Close Volume Dividends
Min. : 5.738 Min. : 5.792 Min. :2.405e+07 Min. :0.000000
1st Qu.: 16.817 1st Qu.: 16.983 1st Qu.:9.277e+07 1st Qu.:0.000000
Median : 28.849 Median : 29.074 Median :1.527e+08 Median :0.000000
Mean : 57.028 Mean : 57.663 Mean :2.423e+08 Mean :0.002201
3rd Qu.: 87.794 3rd Qu.: 88.964 3rd Qu.:3.285e+08 3rd Qu.:0.000000
Max. :196.255 Max. :197.361 Max. :1.881e+09 Max. :0.240000
Stock Splits volume_adi volume_obv
Min. :0.000000 Min. :-2.342e+09 Min. :-2.336e+09
1st Qu.:0.000000 1st Qu.: 1.621e+10 1st Qu.: 1.213e+10
Median :0.000000 Median : 2.024e+10 Median : 1.456e+10
Mean :0.003124 Mean : 1.963e+10 Mean : 1.409e+10
3rd Qu.:0.000000 3rd Qu.: 2.473e+10 3rd Qu.: 1.729e+10
Max. :7.000000 Max. : 2.857e+10 Max. : 2.518e+10
volume_cmf volume_fi volume_em
Min. :-0.38278 Min. :-501620861 Min. :-41.30846
1st Qu.:-0.06101 1st Qu.: -17467431 1st Qu.: -0.04188
Median : 0.03673 Median : 5344530 Median : 0.00247
Mean : 0.04584 Mean : 2394485 Mean : 0.06311
3rd Qu.: 0.15407 3rd Qu.: 27102971 3rd Qu.: 0.09083
Max. : 0.53774 Max. : 568055034 Max. : 40.25804
volume_sma_em volume_vpt volume_vwap volume_mfi
Min. :-8.818908 Min. :-85171432 Min. : 5.933 Min. : 5.124
1st Qu.:-0.008155 1st Qu.:112963429 1st Qu.: 16.847 1st Qu.: 42.332
Median : 0.001326 Median :164316649 Median : 28.824 Median : 54.649
Mean : 0.065423 Mean :167624514 Mean : 57.261 Mean : 54.172
3rd Qu.: 0.055211 3rd Qu.:245459705 3rd Qu.: 86.250 3rd Qu.: 66.315
Max. : 7.322233 Max. :381590429 Max. :194.795 Max. :100.000
volume_nvi volatility_bbm volatility_bbh volatility_bbl
Min. : 920.5 Min. : 5.977 Min. : 6.189 Min. : 5.65
1st Qu.: 3726.5 1st Qu.: 16.964 1st Qu.: 17.844 1st Qu.: 15.99
Median : 7492.4 Median : 28.737 Median : 29.904 Median : 27.72
Mean :18714.8 Mean : 57.157 Mean : 60.514 Mean : 53.80
3rd Qu.:25319.0 3rd Qu.: 84.898 3rd Qu.: 91.217 3rd Qu.: 78.54
Max. :73894.0 Max. :193.445 Max. :201.893 Max. :188.73
volatility_bbw volatility_bbp volatility_bbhi volatility_bbli
Min. : 0.000 Min. :-0.3265 Min. :0.00000 Min. :0.00000
1st Qu.: 7.511 1st Qu.: 0.3219 1st Qu.:0.00000 1st Qu.:0.00000
Median :10.657 Median : 0.6525 Median :0.00000 Median :0.00000
Mean :11.409 Mean : 0.5908 Mean :0.08066 Mean :0.04289
3rd Qu.:14.444 3rd Qu.: 0.8612 3rd Qu.:0.00000 3rd Qu.:0.00000
Max. :35.339 Max. : 1.3928 Max. :1.00000 Max. :1.00000
volatility_kcc volatility_kch volatility_kcl volatility_kcw
Min. : 5.894 Min. : 6.047 Min. : 5.738 Min. : 1.494
1st Qu.: 16.922 1st Qu.: 17.261 1st Qu.: 16.612 1st Qu.: 2.955
Median : 28.859 Median : 29.386 Median : 28.392 Median : 3.653
Mean : 57.406 Mean : 58.620 Mean : 56.192 Mean : 3.997
3rd Qu.: 87.366 3rd Qu.: 89.444 3rd Qu.: 85.287 3rd Qu.: 4.712
Max. :194.979 Max. :197.483 Max. :192.612 Max. :14.400
volatility_kcp volatility_kchi volatility_kcli volatility_dcl
Min. :-1.8668 Min. :0.0000 Min. :0.000 Min. : 5.738
1st Qu.: 0.1850 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.: 15.982
Median : 0.6817 Median :0.0000 Median :0.000 Median : 27.610
Mean : 0.6763 Mean :0.3473 Mean :0.186 Mean : 53.536
3rd Qu.: 1.1750 3rd Qu.:1.0000 3rd Qu.:0.000 3rd Qu.: 78.229
Max. : 3.5342 Max. :1.0000 Max. :1.000 Max. :186.741
volatility_dch volatility_dcm volatility_dcw volatility_dcp
Min. : 6.188 Min. : 5.963 Min. : 0.9906 Min. :0.0000
1st Qu.: 17.838 1st Qu.: 16.819 1st Qu.: 8.3524 1st Qu.:0.3503
Median : 29.902 Median : 28.861 Median :11.1999 Median :0.6869
Mean : 60.527 Mean : 57.031 Mean :11.8711 Mean :0.6141
3rd Qu.: 90.813 3rd Qu.: 84.521 3rd Qu.:14.4380 3rd Qu.:0.8880
Max. :198.865 Max. :192.803 Max. :35.0159 Max. :1.0000
volatility_atr volatility_ui trend_macd trend_macd_signal
Min. :0.0000 Min. : 0.000 Min. :-6.5255 Min. :-5.6851
1st Qu.:0.3291 1st Qu.: 1.276 1st Qu.:-0.1173 1st Qu.:-0.1126
Median :0.5740 Median : 2.670 Median : 0.1837 Median : 0.1769
Mean :1.3404 Mean : 3.560 Mean : 0.3652 Mean : 0.3622
3rd Qu.:2.3320 3rd Qu.: 5.181 3rd Qu.: 0.7268 3rd Qu.: 0.6789
Max. :6.2172 Max. :17.317 Max. : 7.3736 Max. : 6.7762
trend_macd_diff trend_sma_fast trend_sma_slow trend_ema_fast
Min. :-2.459740 Min. : 5.914 Min. : 6.022 Min. : 5.96
1st Qu.:-0.100186 1st Qu.: 16.880 1st Qu.: 16.951 1st Qu.: 16.98
Median : 0.002324 Median : 28.807 Median : 28.717 Median : 28.75
Mean : 0.003017 Mean : 57.371 Mean : 56.999 Mean : 57.37
3rd Qu.: 0.112551 3rd Qu.: 86.953 3rd Qu.: 83.280 3rd Qu.: 86.72
Max. : 2.341696 Max. :194.881 Max. :192.541 Max. :194.28
trend_ema_slow trend_vortex_ind_pos trend_vortex_ind_neg
Min. : 6.074 Min. :0.0000 Min. :0.0000
1st Qu.: 17.016 1st Qu.:0.8603 1st Qu.:0.7469
Median : 28.672 Median :1.0024 Median :0.8772
Mean : 57.006 Mean :0.9938 Mean :0.8766
3rd Qu.: 83.680 3rd Qu.:1.1307 3rd Qu.:1.0041
Max. :191.599 Max. :1.4721 Max. :1.3484
trend_vortex_ind_diff trend_trix trend_mass_index trend_dpo
Min. :-0.7792 Min. :-88.58832 Min. : 1.00 Min. :-13.51912
1st Qu.:-0.1386 1st Qu.: -0.07125 1st Qu.:24.21 1st Qu.: -0.47526
Median : 0.1265 Median : 0.11215 Median :24.94 Median : -0.03936
Mean : 0.1172 Mean : 0.07117 Mean :24.97 Mean : 0.07826
3rd Qu.: 0.3694 3rd Qu.: 0.27553 3rd Qu.:25.72 3rd Qu.: 0.35525
Max. : 1.0567 Max. : 0.94081 Max. :29.75 Max. : 51.31237
trend_kst trend_kst_sig trend_kst_diff trend_ichimoku_conv
Min. :-889.86 Min. :-889.09 Min. :-80.7051 Min. : 5.897
1st Qu.: -17.42 1st Qu.: -17.11 1st Qu.:-13.0326 1st Qu.: 16.898
Median : 25.36 Median : 25.54 Median : -0.2415 Median : 28.972
Mean : 17.98 Mean : 16.90 Mean : 1.0807 Mean : 57.359
3rd Qu.: 69.12 3rd Qu.: 68.38 3rd Qu.: 14.2356 3rd Qu.: 87.493
Max. : 239.82 Max. : 230.28 Max. :151.0955 Max. :195.483
trend_ichimoku_base trend_ichimoku_a trend_ichimoku_b trend_stc
Min. : 6.05 Min. : 6.008 Min. : 6.12 Min. : 0.000
1st Qu.: 16.80 1st Qu.: 16.947 1st Qu.: 16.72 1st Qu.: 1.581
Median : 28.86 Median : 28.880 Median : 28.67 Median : 56.989
Mean : 56.87 Mean : 57.113 Mean : 56.24 Mean : 52.617
3rd Qu.: 83.58 3rd Qu.: 85.538 3rd Qu.: 77.68 3rd Qu.: 99.609
Max. :192.80 Max. :194.143 Max. :186.30 Max. :100.000
trend_adx trend_adx_pos trend_adx_neg trend_cci
Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. :-382.86
1st Qu.:18.60 1st Qu.:19.84 1st Qu.:16.63 1st Qu.: -58.80
Median :24.78 Median :25.88 Median :22.33 Median : 50.00
Mean :26.61 Mean :26.55 Mean :22.89 Mean : 29.83
3rd Qu.:32.90 3rd Qu.:32.37 3rd Qu.:28.55 3rd Qu.: 113.70
Max. :70.18 Max. :61.42 Max. :57.93 Max. : 354.35
trend_visual_ichimoku_a trend_visual_ichimoku_b trend_aroon_up
Min. : 6.008 Min. : 6.12 Min. : 0.0
1st Qu.: 16.947 1st Qu.: 16.72 1st Qu.: 20.0
Median : 28.880 Median : 28.67 Median : 68.0
Mean : 56.145 Mean : 55.33 Mean : 58.2
3rd Qu.: 75.865 3rd Qu.: 70.97 3rd Qu.: 96.0
Max. :191.937 Max. :186.30 Max. :100.0
trend_aroon_down trend_aroon_ind trend_psar_up trend_psar_down
Min. : 0.00 Min. :-100.00 Min. : 5.738 Min. : 6.117
1st Qu.: 4.00 1st Qu.: -44.00 1st Qu.: 16.764 1st Qu.: 16.986
Median : 28.00 Median : 40.00 Median : 28.580 Median : 29.361
Mean : 38.08 Mean : 20.11 Mean : 56.392 Mean : 57.875
3rd Qu.: 68.00 3rd Qu.: 80.00 3rd Qu.: 84.951 3rd Qu.: 86.410
Max. :100.00 Max. : 100.00 Max. :193.546 Max. :198.865
trend_psar_up_indicator trend_psar_down_indicator momentum_rsi
Min. :0.00000 Min. :0.00000 Min. : 8.231
1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.: 45.785
Median :0.00000 Median :0.00000 Median : 55.489
Mean :0.04686 Mean :0.04715 Mean : 55.581
3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.: 65.089
Max. :1.00000 Max. :1.00000 Max. :100.000
momentum_stoch_rsi momentum_stoch_rsi_k momentum_stoch_rsi_d momentum_tsi
Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :-45.755
1st Qu.:0.1912 1st Qu.:0.2204 1st Qu.:0.2340 1st Qu.: -6.097
Median :0.5584 Median :0.5507 Median :0.5467 Median : 10.070
Mean :0.5343 Mean :0.5342 Mean :0.5341 Mean : 11.231
3rd Qu.:0.9082 3rd Qu.:0.8602 3rd Qu.:0.8445 3rd Qu.: 28.005
Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :100.000
momentum_uo momentum_stoch momentum_stoch_signal momentum_wr
Min. : 0.00 Min. : 0.00 Min. : 0.5865 Min. :-100.00
1st Qu.:46.20 1st Qu.: 33.35 1st Qu.:34.3925 1st Qu.: -66.65
Median :53.38 Median : 67.41 Median :67.3634 Median : -32.59
Mean :53.40 Mean : 60.22 Mean :60.2308 Mean : -39.78
3rd Qu.:61.05 3rd Qu.: 88.44 3rd Qu.:86.3243 3rd Qu.: -11.56
Max. :82.09 Max. :100.00 Max. :99.7457 Max. : 0.00
momentum_ao momentum_roc momentum_ppo momentum_ppo_signal
Min. :-17.2046 Min. :-24.278 Min. :-6.8288 Min. :-5.5258
1st Qu.: -0.3629 1st Qu.: -2.323 1st Qu.:-0.5307 1st Qu.:-0.4537
Median : 0.4021 Median : 1.547 Median : 0.8099 Median : 0.7765
Mean : 0.7681 Mean : 1.344 Mean : 0.6364 Mean : 0.6348
3rd Qu.: 1.7286 3rd Qu.: 5.244 3rd Qu.: 1.9464 3rd Qu.: 1.8350
Max. : 17.4806 Max. : 23.554 Max. : 6.4446 Max. : 6.1186
momentum_ppo_hist momentum_pvo momentum_pvo_signal momentum_pvo_hist
Min. :-2.189591 Min. :-21.719 Min. :-16.082 Min. :-10.336444
1st Qu.:-0.353285 1st Qu.: -6.900 1st Qu.: -5.946 1st Qu.: -2.914235
Median :-0.008357 Median : -2.095 Median : -1.604 Median : -0.481628
Mean : 0.001584 Mean : -1.020 Mean : -1.019 Mean : -0.000683
3rd Qu.: 0.367885 3rd Qu.: 4.027 3rd Qu.: 3.333 3rd Qu.: 2.403968
Max. : 2.325097 Max. : 31.224 Max. : 19.575 Max. : 25.810565
momentum_kama others_dr others_dlr
Min. : 6.089 Min. :-12.86471 Min. :-13.77082
1st Qu.: 16.900 1st Qu.: -0.75158 1st Qu.: -0.75442
Median : 29.027 Median : 0.09504 Median : 0.09499
Mean : 57.513 Mean : 0.11231 Mean : 0.09648
3rd Qu.: 85.527 3rd Qu.: 1.04726 3rd Qu.: 1.04182
Max. :192.374 Max. : 11.98083 Max. : 11.31575
# The first thing we want to do, is split the data into training and test set
ro.r('''
aapl_r_df$Date <- as.Date(aapl_r_df$Date)
train_data <- aapl_r_df[aapl_r_df$Date < as.Date("2021-01-01"), ]
test_data <- aapl_r_df[aapl_r_df$Date >= as.Date("2021-01-01"), ]
# Show the first few rows of train dataset
print(head(train_data))
''')
Date Open High Low Date Open High Low Close Volume Dividends
0 2010-01-04 6.437013 6.469284 6.405345 6.454506 493729600 0
1 2010-01-05 6.472299 6.502157 6.431583 6.465664 601904800 0
2 2010-01-06 6.465665 6.491301 6.356185 6.362820 552160000 0
3 2010-01-07 6.386344 6.393885 6.304913 6.351057 477131200 0
4 2010-01-08 6.342612 6.393884 6.305214 6.393280 447610800 0
5 2010-01-11 6.418012 6.424045 6.286817 6.336883 462229600 0
Stock Splits volume_adi volume_obv volume_cmf volume_fi volume_em
0 0 265496575 493729600 0.53773680 0.0 0.000000e+00
1 0 244921715 1095634400 0.22354329 6716047.4 3.465412e-04
2 0 -253007153 543474400 -0.15354291 -2355703.8 -1.055343e-03
3 0 -235219817 66343200 -0.11069555 -2820966.1 -1.386319e-03
4 0 206296808 513954000 0.08019199 281951.9 2.976624e-06
5 0 81344925 51724400 0.02680435 -3482376.2 1.746282e-04
volume_sma_em volume_vpt volume_vwap volume_mfi volume_nvi volatility_bbm
0 0.0000000000 0 6.443045 50.00000 1000.0000 6.454506
1 0.0003465412 1040521 6.455913 100.00000 1000.0000 6.460085
2 -0.0003544007 -7742214 6.438328 52.39958 984.0939 6.427663
3 -0.0006983736 -8624298 6.418484 37.21857 982.2745 6.408512
4 -0.0005230360 -5648500 6.409026 50.65897 988.8049 6.405465
5 -0.0003835032 -9725960 6.399921 41.50478 988.8049 6.394035
volatility_bbh volatility_bbl volatility_bbw volatility_bbp volatility_bbhi
0 6.454506 6.454506 0.0000000 0.0000000 0
1 6.471243 6.448927 0.3454441 0.7500000 0
2 6.519817 6.335510 2.8674042 0.1481786 0
3 6.512293 6.304730 3.2388640 0.2231939 0
4 6.499087 6.311844 2.9231717 0.4349220 0
5 6.493620 6.294450 3.1149406 0.2130500 0
volatility_bbli volatility_kcc volatility_kch volatility_kcl volatility_kcw
0 0 6.443045 6.506984 6.379106 1.984735
1 0 6.454757 6.522013 6.387500 2.083938
2 0 6.437650 6.527526 6.347773 2.792214
3 0 6.415725 6.505375 6.326075 2.794706
4 0 6.405405 6.494859 6.315951 2.793084
5 0 6.396046 6.493462 6.298629 3.046143
volatility_kcp volatility_kchi volatility_kcli volatility_dcl volatility_dch
0 0.58962280 0 0 6.405345 6.469284
1 0.58108717 0 0 6.405345 6.502157
2 0.08370979 0 0 6.356185 6.502157
3 0.13933230 0 0 6.304913 6.502157
4 0.43222692 0 0 6.304913 6.502157
5 0.19634120 0 0 6.286817 6.502157
volatility_dcm volatility_dcw volatility_dcp volatility_atr volatility_ui
0 6.437315 0.9906056 0.76886840 0 0
1 6.453751 1.4986158 0.62304802 0 0
2 6.429171 2.2710041 0.04545593 0 0
3 6.403535 3.0778545 0.23394494 0 0
4 6.403535 3.0793183 0.44800890 0 0
5 6.394487 3.3678253 0.23249596 0 0
trend_macd trend_macd_signal trend_macd_diff trend_sma_fast trend_sma_slow
0 0.000000000 0.0000000000 0.0000000000 6.454506 6.454506
1 0.000890096 0.0001780192 0.0007120768 6.460085 6.460085
2 -0.006626740 -0.0011829326 -0.0054438070 6.427663 6.427663
3 -0.013378846 -0.0036221153 -0.0097567311 6.408512 6.408512
4 -0.015148279 -0.0059273480 -0.0092209306 6.405465 6.405465
5 -0.020860859 -0.0089140501 -0.0119468085 6.394035 6.394035
trend_ema_fast trend_ema_slow trend_vortex_ind_pos trend_vortex_ind_neg
0 6.454506 6.454506 0.000000000 0.0000000000
1 6.456223 6.455332 0.001886121 0.0007345085
2 6.441853 6.448480 0.003041566 0.0035689991
3 6.427884 6.441263 0.003767603 0.0071783390
4 6.422561 6.437709 0.005483999 0.0088830590
5 6.409379 6.430240 0.007764440 0.0109272896
trend_vortex_ind_diff trend_trix trend_mass_index trend_dpo trend_kst
0 0.0000000000 -8.858832e+01 1.000000 51.20881 -888.0656
1 0.0011516129 3.376393e-04 2.016537 51.20323 -887.9689
2 -0.0005274335 -2.225728e-03 3.195459 51.23565 -888.5311
3 -0.0034107358 -6.974152e-03 4.355135 51.25480 -888.8633
4 -0.0033990600 -1.169175e-02 5.494924 51.25785 -888.9161
5 -0.0031628495 -1.787098e-02 6.714334 51.26928 -889.1143
trend_kst_sig trend_kst_diff trend_ichimoku_conv trend_ichimoku_base
0 -888.0656 0.0000000 6.437315 6.437315
1 -888.0173 0.0483756 6.453751 6.453751
2 -888.1886 -0.3425875 6.429171 6.429171
3 -888.3572 -0.5060362 6.403535 6.403535
4 -888.4690 -0.4470929 6.403535 6.403535
5 -888.5766 -0.5377660 6.394487 6.394487
trend_ichimoku_a trend_ichimoku_b trend_stc trend_adx trend_adx_pos
0 6.437315 6.437315 0 0 0
1 6.453751 6.453751 0 0 0
2 6.429171 6.429171 0 0 0
3 6.403535 6.403535 0 0 0
4 6.403535 6.403535 0 0 0
5 6.394487 6.394487 0 0 0
trend_adx_neg trend_cci trend_visual_ichimoku_a trend_visual_ichimoku_b
0 0 0.00000 57.11322 56.24289
1 0 66.66667 57.11322 56.24289
2 0 -100.00000 57.11322 56.24289
3 0 -112.34242 57.11322 56.24289
4 0 -69.70293 57.11322 56.24289
5 0 -74.98892 57.11322 56.24289
trend_aroon_up trend_aroon_down trend_aroon_ind trend_psar_up trend_psar_down
0 0 0 0 6.15562 6.469284
1 4 0 4 6.15562 6.469284
2 4 8 -4 6.15562 6.469284
3 4 12 -8 6.15562 6.502157
4 4 12 -8 6.15562 6.494268
5 4 20 -16 6.15562 6.486693
trend_psar_up_indicator trend_psar_down_indicator momentum_rsi
0 0 0 100.000000
1 0 0 100.000000
2 0 1 9.152434
3 0 0 8.231324
4 0 0 33.933418
5 0 0 24.188559
momentum_stoch_rsi momentum_stoch_rsi_k momentum_stoch_rsi_d momentum_tsi
0 0 0 0 0.00000
1 0 0 0 100.00000
2 0 0 0 81.42043
3 0 0 0 66.95266
4 0 0 0 58.04171
5 0 0 0 45.52531
momentum_uo momentum_stoch momentum_stoch_signal momentum_wr momentum_ao
0 0.00000 76.886840 76.88684 -23.11316 0.00000000
1 25.33655 62.304802 69.59582 -37.69520 0.00000000
2 15.10084 4.545593 47.91241 -95.45441 0.00000000
3 24.22207 23.394494 30.08163 -76.60551 0.00000000
4 39.10976 44.800890 24.24699 -55.19911 0.00000000
5 38.49324 23.249596 30.48166 -76.75040 -0.00805275
momentum_roc momentum_ppo momentum_ppo_signal momentum_ppo_hist momentum_pvo
0 0 0.00000000 0.000000000 0.00000000 0.0000000
1 0 0.01378854 0.002757708 0.01103083 1.7198781
2 0 -0.10276437 -0.018346708 -0.08441766 2.2401939
3 0 -0.20770532 -0.056218431 -0.15148689 1.4542460
4 0 -0.23530543 -0.092035831 -0.14326960 0.3496296
5 0 -0.32441803 -0.138512270 -0.18590576 -0.2972685
momentum_pvo_signal momentum_pvo_hist momentum_kama others_dr others_dlr
0 0.0000000 0.0000000 6.454506 0.0000000 0.0000000
1 0.3439756 1.3759025 6.459657 0.1728713 0.1727221
2 0.7232193 1.5169746 6.415663 -1.5906141 -1.6034001
3 0.8694246 0.5848213 6.386017 -0.1848724 -0.1850435
4 0.7654656 -0.4158360 6.389281 0.6648181 0.6626179
5 0.5529188 -0.8501873 6.365769 -0.8821288 -0.8860425
ro.r(''' # Show first few rows of test dataset
print(head(test_data))
''')
Date Open High Low Date Open High Low Close Volume Dividends
2769 2021-01-04 130.7068 130.7949 124.0893 126.6834 143301900 0
2770 2021-01-05 126.1744 128.9643 125.7241 128.2497 97664900 0
2771 2021-01-06 125.0290 128.2889 123.7173 123.9326 155088000 0
2772 2021-01-07 125.6555 128.8566 125.1661 128.1616 109578200 0
2773 2021-01-08 129.6398 129.8356 127.4861 129.2678 105158200 0
2774 2021-01-11 126.4681 127.4274 125.7926 126.2625 100384500 0
Stock Splits volume_adi volume_obv volume_cmf volume_fi volume_em
2769 0 25411723202 18006868300 -0.014690928 -17651971 -13.948546
2770 0 25466308843 18104533200 0.007171973 6722631 -0.324804
2771 0 25325833051 17949445200 -0.066527463 -89884281 -3.953301
2772 0 25394137421 18059023400 -0.048975208 -10843558 3.395838
2773 0 25448469191 18164181600 0.009338842 7323769 3.685345
2774 0 25405790301 18063797100 -0.032781904 -36820671 -3.339920
volume_sma_em volume_vpt volume_vwap volume_mfi volume_nvi volatility_bbm
2769 1.07495835 253708522 127.4522 54.91323 34693.63 125.3550
2770 0.92815558 254916022 127.8364 55.54984 35122.57 125.7838
2771 0.05989194 249695536 127.9674 46.36423 35122.57 125.9233
2772 0.11952482 253434671 128.1110 46.78974 36321.06 126.2434
2773 0.23250602 254342335 128.2734 47.19919 36634.56 126.7461
2774 0.12107051 252008519 128.6055 49.81354 35782.85 127.0270
volatility_bbh volatility_bbl volatility_bbw volatility_bbp
2769 134.2932 116.4168 14.26061 0.5743106
2770 134.4081 117.1595 13.71282 0.6429638
2771 134.3303 117.5163 13.35254 0.3816069
2772 134.4776 118.0092 13.04494 0.6164770
2773 134.4102 119.0820 12.09363 0.6645151
2774 134.1699 119.8841 11.24627 0.4464819
volatility_bbhi volatility_bbli volatility_kcc volatility_kch
2769 0 0 129.0913 132.7476
2770 0 0 129.3947 133.0834
2771 0 0 129.5266 133.1966
2772 0 0 129.3461 132.9192
2773 0 0 129.3703 133.0168
2774 0 0 129.0805 132.6595
volatility_kcl volatility_kcw volatility_kcp volatility_kchi
2769 125.4350 5.664687 0.1707271 0
2770 125.7061 5.701335 0.3447878 0
2771 125.8566 5.666814 -0.2621132 0
2772 125.7730 5.524866 0.3342434 0
2773 125.7237 5.637341 0.4859513 0
2774 125.5015 5.545338 0.1063101 0
volatility_kcli volatility_dcl volatility_dch volatility_dcm
2769 0 117.6185 135.8658 126.7422
2770 0 117.6185 135.8658 126.7422
2771 1 117.6185 135.8658 126.7422
2772 0 117.6185 135.8658 126.7422
2773 0 117.6185 135.8658 126.7422
2774 0 118.0101 135.8658 126.9380
volatility_dcw volatility_dcp volatility_atr volatility_ui trend_macd
2769 14.55645 0.4967812 3.625843 1.907276 3.389452
2770 14.50683 0.5826176 3.587286 2.135172 3.166302
2771 14.49076 0.3460300 3.685719 2.907064 2.611005
2772 14.45401 0.5777884 3.809547 3.118267 2.483540
2773 14.39669 0.6384121 3.663536 3.247561 2.443617
2774 14.05659 0.4621708 3.644703 3.555237 2.144751
trend_macd_signal trend_macd_diff trend_sma_fast trend_sma_slow
2769 3.349603 0.03984921 128.3696 123.5512
2770 3.312943 -0.14664044 128.6307 124.1152
2771 3.172555 -0.56155020 128.4594 124.4921
2772 3.034752 -0.55121204 128.8069 124.9391
2773 2.916525 -0.47290795 129.1185 125.2903
2774 2.762170 -0.61741960 128.8819 125.5125
trend_ema_fast trend_ema_slow trend_vortex_ind_pos trend_vortex_ind_neg
2769 127.8997 124.5103 1.0102009 0.7653968
2770 127.9536 124.7872 1.0226346 0.8179947
2771 127.3349 124.7239 0.9792360 0.9288597
2772 127.4621 124.9786 0.9404317 0.9082032
2773 127.7399 125.2963 0.9611539 0.9171152
2774 127.5126 125.3679 0.9337240 0.9197245
trend_vortex_ind_diff trend_trix trend_mass_index trend_dpo trend_kst
2769 0.24480407 0.4093338 25.66873 -0.2378792 102.20648
2770 0.20463998 0.4066834 25.94140 0.2046043 103.08063
2771 0.05037637 0.3925853 26.08026 -1.9319096 98.63560
2772 0.03222857 0.3775901 26.13021 -0.7150917 93.02788
2773 0.04403875 0.3636756 26.12101 2.3553242 89.81827
2774 0.01399951 0.3455297 26.09528 1.1737675 84.46870
trend_kst_sig trend_kst_diff trend_ichimoku_conv trend_ichimoku_base
2769 91.97215 10.2343263 128.3574 124.3046
2770 95.51659 7.5640350 129.9775 124.8186
2771 97.99674 0.6388617 129.7915 125.1073
2772 99.20883 -6.1809446 129.7915 126.6736
2773 99.48599 -9.6677172 129.7915 126.7422
2774 98.15486 -13.6861689 129.7915 126.7422
trend_ichimoku_a trend_ichimoku_b trend_stc trend_adx trend_adx_pos
2769 126.3310 120.3719 49.999735 24.30929 29.41934
2770 127.3980 120.3719 24.999868 23.09144 27.45868
2771 127.4494 120.3719 12.499934 21.53552 24.93386
2772 128.2326 120.3719 6.249967 20.07573 23.64207
2773 128.2668 120.3719 3.124983 19.01872 24.49260
2774 128.2668 120.3719 1.562492 17.83205 22.78034
trend_adx_neg trend_cci trend_visual_ichimoku_a trend_visual_ichimoku_b
2769 25.43710 30.45343 113.2391 111.6453
2770 23.74184 32.78466 113.2391 111.6453
2771 25.59509 -12.14355 113.2391 111.6453
2772 23.12832 21.68439 113.2342 111.6453
2773 22.03699 44.39186 114.2082 111.6453
2774 23.90323 -12.27294 114.2082 111.6453
trend_aroon_up trend_aroon_down trend_aroon_ind trend_psar_up
2769 88 0 88 123.438
2770 84 0 84 123.438
2771 80 0 80 123.438
2772 76 0 76 123.438
2773 72 24 48 123.438
2774 68 20 48 123.438
trend_psar_down trend_psar_up_indicator trend_psar_down_indicator
2769 135.8658 0 1
2770 135.6303 0 0
2771 135.3994 0 0
2772 134.9322 0 0
2773 134.4836 0 0
2774 134.0529 0 0
momentum_rsi momentum_stoch_rsi momentum_stoch_rsi_k momentum_stoch_rsi_d
2769 54.95762 0.0000000 0.32577450 0.5522281
2770 57.72290 0.1479567 0.18788292 0.3605759
2771 48.82539 0.0000000 0.04931889 0.1876588
2772 55.98301 0.2883579 0.14543818 0.1275467
2773 57.65158 0.3555791 0.21464567 0.1364676
2774 51.89596 0.1237036 0.25588021 0.2053214
momentum_tsi momentum_uo momentum_stoch momentum_stoch_signal momentum_wr
2769 28.10394 45.54722 45.62326 61.66026 -54.37674
2770 26.12204 50.35984 49.28283 53.82103 -50.71717
2771 21.24648 43.73471 20.53449 38.48019 -79.46551
2772 19.57976 44.27394 48.69602 39.50445 -51.30398
2773 18.84834 48.97105 56.06256 41.76436 -43.93744
2774 16.25136 48.00849 36.04948 46.93602 -63.95052
momentum_ao momentum_roc momentum_ppo momentum_ppo_signal
2769 9.621153 1.196447 2.722227 2.724815
2770 8.249873 2.503702 2.537360 2.687324
2771 6.449652 -1.631708 2.093427 2.568545
2772 5.194599 3.363302 1.987172 2.452270
2773 4.481708 2.979005 1.950271 2.351870
2774 3.987060 -2.198985 1.710766 2.223649
momentum_ppo_hist momentum_pvo momentum_pvo_signal momentum_pvo_hist
2769 -0.002587856 2.357684 1.149709 1.20797511
2770 -0.149963761 1.044884 1.128744 -0.08385972
2771 -0.475117600 4.028895 1.708774 2.32012043
2772 -0.465097806 3.150377 1.997095 1.15328231
2773 -0.401599652 2.117547 2.021185 0.09636208
2774 -0.512883733 0.940333 1.805015 -0.86468179
momentum_kama others_dr others_dlr
2769 126.9639 -2.4719204 -2.5029853
2770 127.0107 1.2363705 1.2287898
2771 126.9760 -3.3661441 -3.4241031
2772 126.9853 3.4122976 3.3553701
2773 127.0043 0.8631419 0.8594382
2774 126.9913 -2.3248771 -2.3523287
ro.r(''' # Check for missing values
sum(is.na(train_data))
''')
| 0 |
ro.r(''' # Convert to data.table
library(data.table)
setDT(train_data) # Convert the train data to data.table
setDT(test_data) # Convert the test data to data.table
class(train_data) # Check to make sure conversion was done
''')
array(['data.table', 'data.frame'], dtype='<U10')
ro.r('''
class(test_data)
''')
array(['data.table', 'data.frame'], dtype='<U10')
ro.r('''
# Split the data into y ~ response and X ~ design matrix (predictors)
# Response
response <- train_data[,.(Close)]
print(head(response))
# Design Matrix
predictors <- train_data[, !"Close", with = FALSE]
print(head(predictors))
# For correlation and histogram creation
predictors <- as.data.frame(predictors)
''')
Close
Close
<num>
1: 6.454506
2: 6.465664
3: 6.362820
4: 6.351057
5: 6.393280
6: 6.336883
Date Open High Low Volume Dividends Stock Splits
<Date> <num> <num> <num> <int> <num> <num>
1: 2010-01-04 6.437013 6.469284 6.405345 493729600 0 0
2: 2010-01-05 6.472299 6.502157 6.431583 601904800 0 0
3: 2010-01-06 6.465665 6.491301 6.356185 552160000 0 0
4: 2010-01-07 6.386344 6.393885 6.304913 477131200 0 0
5: 2010-01-08 6.342612 6.393884 6.305214 447610800 0 0
6: 2010-01-11 6.418012 6.424045 6.286817 462229600 0 0
volume_adi volume_obv volume_cmf volume_fi volume_em volume_sma_em
<num> <num> <num> <num> <num> <num>
1: 265496575 493729600 0.53773680 0.0 0.000000e+00 0.0000000000
2: 244921715 1095634400 0.22354329 6716047.4 3.465412e-04 0.0003465412
3: -253007153 543474400 -0.15354291 -2355703.8 -1.055343e-03 -0.0003544007
4: -235219817 66343200 -0.11069555 -2820966.1 -1.386319e-03 -0.0006983736
5: 206296808 513954000 0.08019199 281951.9 2.976624e-06 -0.0005230360
6: 81344925 51724400 0.02680435 -3482376.2 1.746282e-04 -0.0003835032
volume_vpt volume_vwap volume_mfi volume_nvi volatility_bbm volatility_bbh
<num> <num> <num> <num> <num> <num>
1: 0 6.443045 50.00000 1000.0000 6.454506 6.454506
2: 1040521 6.455913 100.00000 1000.0000 6.460085 6.471243
3: -7742214 6.438328 52.39958 984.0939 6.427663 6.519817
4: -8624298 6.418484 37.21857 982.2745 6.408512 6.512293
5: -5648500 6.409026 50.65897 988.8049 6.405465 6.499087
6: -9725960 6.399921 41.50478 988.8049 6.394035 6.493620
volatility_bbl volatility_bbw volatility_bbp volatility_bbhi volatility_bbli
<num> <num> <num> <num> <num>
1: 6.454506 0.0000000 0.0000000 0 0
2: 6.448927 0.3454441 0.7500000 0 0
3: 6.335510 2.8674042 0.1481786 0 0
4: 6.304730 3.2388640 0.2231939 0 0
5: 6.311844 2.9231717 0.4349220 0 0
6: 6.294450 3.1149406 0.2130500 0 0
volatility_kcc volatility_kch volatility_kcl volatility_kcw volatility_kcp
<num> <num> <num> <num> <num>
1: 6.443045 6.506984 6.379106 1.984735 0.58962280
2: 6.454757 6.522013 6.387500 2.083938 0.58108717
3: 6.437650 6.527526 6.347773 2.792214 0.08370979
4: 6.415725 6.505375 6.326075 2.794706 0.13933230
5: 6.405405 6.494859 6.315951 2.793084 0.43222692
6: 6.396046 6.493462 6.298629 3.046143 0.19634120
volatility_kchi volatility_kcli volatility_dcl volatility_dch volatility_dcm
<num> <num> <num> <num> <num>
1: 0 0 6.405345 6.469284 6.437315
2: 0 0 6.405345 6.502157 6.453751
3: 0 0 6.356185 6.502157 6.429171
4: 0 0 6.304913 6.502157 6.403535
5: 0 0 6.304913 6.502157 6.403535
6: 0 0 6.286817 6.502157 6.394487
volatility_dcw volatility_dcp volatility_atr volatility_ui trend_macd
<num> <num> <num> <num> <num>
1: 0.9906056 0.76886840 0 0 0.000000000
2: 1.4986158 0.62304802 0 0 0.000890096
3: 2.2710041 0.04545593 0 0 -0.006626740
4: 3.0778545 0.23394494 0 0 -0.013378846
5: 3.0793183 0.44800890 0 0 -0.015148279
6: 3.3678253 0.23249596 0 0 -0.020860859
trend_macd_signal trend_macd_diff trend_sma_fast trend_sma_slow
<num> <num> <num> <num>
1: 0.0000000000 0.0000000000 6.454506 6.454506
2: 0.0001780192 0.0007120768 6.460085 6.460085
3: -0.0011829326 -0.0054438070 6.427663 6.427663
4: -0.0036221153 -0.0097567311 6.408512 6.408512
5: -0.0059273480 -0.0092209306 6.405465 6.405465
6: -0.0089140501 -0.0119468085 6.394035 6.394035
trend_ema_fast trend_ema_slow trend_vortex_ind_pos trend_vortex_ind_neg
<num> <num> <num> <num>
1: 6.454506 6.454506 0.000000000 0.0000000000
2: 6.456223 6.455332 0.001886121 0.0007345085
3: 6.441853 6.448480 0.003041566 0.0035689991
4: 6.427884 6.441263 0.003767603 0.0071783390
5: 6.422561 6.437709 0.005483999 0.0088830590
6: 6.409379 6.430240 0.007764440 0.0109272896
trend_vortex_ind_diff trend_trix trend_mass_index trend_dpo trend_kst
<num> <num> <num> <num> <num>
1: 0.0000000000 -8.858832e+01 1.000000 51.20881 -888.0656
2: 0.0011516129 3.376393e-04 2.016537 51.20323 -887.9689
3: -0.0005274335 -2.225728e-03 3.195459 51.23565 -888.5311
4: -0.0034107358 -6.974152e-03 4.355135 51.25480 -888.8633
5: -0.0033990600 -1.169175e-02 5.494924 51.25785 -888.9161
6: -0.0031628495 -1.787098e-02 6.714334 51.26928 -889.1143
trend_kst_sig trend_kst_diff trend_ichimoku_conv trend_ichimoku_base
<num> <num> <num> <num>
1: -888.0656 0.0000000 6.437315 6.437315
2: -888.0173 0.0483756 6.453751 6.453751
3: -888.1886 -0.3425875 6.429171 6.429171
4: -888.3572 -0.5060362 6.403535 6.403535
5: -888.4690 -0.4470929 6.403535 6.403535
6: -888.5766 -0.5377660 6.394487 6.394487
trend_ichimoku_a trend_ichimoku_b trend_stc trend_adx trend_adx_pos
<num> <num> <num> <num> <num>
1: 6.437315 6.437315 0 0 0
2: 6.453751 6.453751 0 0 0
3: 6.429171 6.429171 0 0 0
4: 6.403535 6.403535 0 0 0
5: 6.403535 6.403535 0 0 0
6: 6.394487 6.394487 0 0 0
trend_adx_neg trend_cci trend_visual_ichimoku_a trend_visual_ichimoku_b
<num> <num> <num> <num>
1: 0 0.00000 57.11322 56.24289
2: 0 66.66667 57.11322 56.24289
3: 0 -100.00000 57.11322 56.24289
4: 0 -112.34242 57.11322 56.24289
5: 0 -69.70293 57.11322 56.24289
6: 0 -74.98892 57.11322 56.24289
trend_aroon_up trend_aroon_down trend_aroon_ind trend_psar_up
<num> <num> <num> <num>
1: 0 0 0 6.15562
2: 4 0 4 6.15562
3: 4 8 -4 6.15562
4: 4 12 -8 6.15562
5: 4 12 -8 6.15562
6: 4 20 -16 6.15562
trend_psar_down trend_psar_up_indicator trend_psar_down_indicator
<num> <num> <num>
1: 6.469284 0 0
2: 6.469284 0 0
3: 6.469284 0 1
4: 6.502157 0 0
5: 6.494268 0 0
6: 6.486693 0 0
momentum_rsi momentum_stoch_rsi momentum_stoch_rsi_k momentum_stoch_rsi_d
<num> <num> <num> <num>
1: 100.000000 0 0 0
2: 100.000000 0 0 0
3: 9.152434 0 0 0
4: 8.231324 0 0 0
5: 33.933418 0 0 0
6: 24.188559 0 0 0
momentum_tsi momentum_uo momentum_stoch momentum_stoch_signal momentum_wr
<num> <num> <num> <num> <num>
1: 0.00000 0.00000 76.886840 76.88684 -23.11316
2: 100.00000 25.33655 62.304802 69.59582 -37.69520
3: 81.42043 15.10084 4.545593 47.91241 -95.45441
4: 66.95266 24.22207 23.394494 30.08163 -76.60551
5: 58.04171 39.10976 44.800890 24.24699 -55.19911
6: 45.52531 38.49324 23.249596 30.48166 -76.75040
momentum_ao momentum_roc momentum_ppo momentum_ppo_signal momentum_ppo_hist
<num> <num> <num> <num> <num>
1: 0.00000000 0 0.00000000 0.000000000 0.00000000
2: 0.00000000 0 0.01378854 0.002757708 0.01103083
3: 0.00000000 0 -0.10276437 -0.018346708 -0.08441766
4: 0.00000000 0 -0.20770532 -0.056218431 -0.15148689
5: 0.00000000 0 -0.23530543 -0.092035831 -0.14326960
6: -0.00805275 0 -0.32441803 -0.138512270 -0.18590576
momentum_pvo momentum_pvo_signal momentum_pvo_hist momentum_kama others_dr
<num> <num> <num> <num> <num>
1: 0.0000000 0.0000000 0.0000000 6.454506 0.0000000
2: 1.7198781 0.3439756 1.3759025 6.459657 0.1728713
3: 2.2401939 0.7232193 1.5169746 6.415663 -1.5906141
4: 1.4542460 0.8694246 0.5848213 6.386017 -0.1848724
5: 0.3496296 0.7654656 -0.4158360 6.389281 0.6648181
6: -0.2972685 0.5529188 -0.8501873 6.365769 -0.8821288
others_dlr
<num>
1: 0.0000000
2: 0.1727221
3: -1.6034001
4: -0.1850435
5: 0.6626179
6: -0.8860425
ro.r(''' # Data Cleaning
# Create response variable
response <- train_data[,.(Close)]
# Create design matrix (predictors)
predictors <- train_data[,!'Close', with = FALSE]
predictors <- as.data.frame(predictors)
response_df <- as.data.frame(response)
response_numeric <- as.numeric(train_data$Close)
# Extract numeric columns from predictors
numeric_predictors <- predictors[, sapply(predictors, is.numeric)]
# Verify to make sure non-numeric columns (like Date) have been removed
print(head(numeric_predictors))
''')
Open High Low Volume Dividends Open High Low Volume Dividends Stock Splits volume_adi 1 6.437013 6.469284 6.405345 493729600 0 0 265496575 2 6.472299 6.502157 6.431583 601904800 0 0 244921715 3 6.465665 6.491301 6.356185 552160000 0 0 -253007153 4 6.386344 6.393885 6.304913 477131200 0 0 -235219817 5 6.342612 6.393884 6.305214 447610800 0 0 206296808 6 6.418012 6.424045 6.286817 462229600 0 0 81344925 volume_obv volume_cmf volume_fi volume_em volume_sma_em volume_vpt 1 493729600 0.53773680 0.0 0.000000e+00 0.0000000000 0 2 1095634400 0.22354329 6716047.4 3.465412e-04 0.0003465412 1040521 3 543474400 -0.15354291 -2355703.8 -1.055343e-03 -0.0003544007 -7742214 4 66343200 -0.11069555 -2820966.1 -1.386319e-03 -0.0006983736 -8624298 5 513954000 0.08019199 281951.9 2.976624e-06 -0.0005230360 -5648500 6 51724400 0.02680435 -3482376.2 1.746282e-04 -0.0003835032 -9725960 volume_vwap volume_mfi volume_nvi volatility_bbm volatility_bbh 1 6.443045 50.00000 1000.0000 6.454506 6.454506 2 6.455913 100.00000 1000.0000 6.460085 6.471243 3 6.438328 52.39958 984.0939 6.427663 6.519817 4 6.418484 37.21857 982.2745 6.408512 6.512293 5 6.409026 50.65897 988.8049 6.405465 6.499087 6 6.399921 41.50478 988.8049 6.394035 6.493620 volatility_bbl volatility_bbw volatility_bbp volatility_bbhi volatility_bbli 1 6.454506 0.0000000 0.0000000 0 0 2 6.448927 0.3454441 0.7500000 0 0 3 6.335510 2.8674042 0.1481786 0 0 4 6.304730 3.2388640 0.2231939 0 0 5 6.311844 2.9231717 0.4349220 0 0 6 6.294450 3.1149406 0.2130500 0 0 volatility_kcc volatility_kch volatility_kcl volatility_kcw volatility_kcp 1 6.443045 6.506984 6.379106 1.984735 0.58962280 2 6.454757 6.522013 6.387500 2.083938 0.58108717 3 6.437650 6.527526 6.347773 2.792214 0.08370979 4 6.415725 6.505375 6.326075 2.794706 0.13933230 5 6.405405 6.494859 6.315951 2.793084 0.43222692 6 6.396046 6.493462 6.298629 3.046143 0.19634120 volatility_kchi volatility_kcli volatility_dcl volatility_dch volatility_dcm 1 0 0 6.405345 6.469284 6.437315 2 0 0 6.405345 6.502157 6.453751 3 0 0 6.356185 6.502157 6.429171 4 0 0 6.304913 6.502157 6.403535 5 0 0 6.304913 6.502157 6.403535 6 0 0 6.286817 6.502157 6.394487 volatility_dcw volatility_dcp volatility_atr volatility_ui trend_macd 1 0.9906056 0.76886840 0 0 0.000000000 2 1.4986158 0.62304802 0 0 0.000890096 3 2.2710041 0.04545593 0 0 -0.006626740 4 3.0778545 0.23394494 0 0 -0.013378846 5 3.0793183 0.44800890 0 0 -0.015148279 6 3.3678253 0.23249596 0 0 -0.020860859 trend_macd_signal trend_macd_diff trend_sma_fast trend_sma_slow 1 0.0000000000 0.0000000000 6.454506 6.454506 2 0.0001780192 0.0007120768 6.460085 6.460085 3 -0.0011829326 -0.0054438070 6.427663 6.427663 4 -0.0036221153 -0.0097567311 6.408512 6.408512 5 -0.0059273480 -0.0092209306 6.405465 6.405465 6 -0.0089140501 -0.0119468085 6.394035 6.394035 trend_ema_fast trend_ema_slow trend_vortex_ind_pos trend_vortex_ind_neg 1 6.454506 6.454506 0.000000000 0.0000000000 2 6.456223 6.455332 0.001886121 0.0007345085 3 6.441853 6.448480 0.003041566 0.0035689991 4 6.427884 6.441263 0.003767603 0.0071783390 5 6.422561 6.437709 0.005483999 0.0088830590 6 6.409379 6.430240 0.007764440 0.0109272896 trend_vortex_ind_diff trend_trix trend_mass_index trend_dpo trend_kst 1 0.0000000000 -8.858832e+01 1.000000 51.20881 -888.0656 2 0.0011516129 3.376393e-04 2.016537 51.20323 -887.9689 3 -0.0005274335 -2.225728e-03 3.195459 51.23565 -888.5311 4 -0.0034107358 -6.974152e-03 4.355135 51.25480 -888.8633 5 -0.0033990600 -1.169175e-02 5.494924 51.25785 -888.9161 6 -0.0031628495 -1.787098e-02 6.714334 51.26928 -889.1143 trend_kst_sig trend_kst_diff trend_ichimoku_conv trend_ichimoku_base 1 -888.0656 0.0000000 6.437315 6.437315 2 -888.0173 0.0483756 6.453751 6.453751 3 -888.1886 -0.3425875 6.429171 6.429171 4 -888.3572 -0.5060362 6.403535 6.403535 5 -888.4690 -0.4470929 6.403535 6.403535 6 -888.5766 -0.5377660 6.394487 6.394487 trend_ichimoku_a trend_ichimoku_b trend_stc trend_adx trend_adx_pos 1 6.437315 6.437315 0 0 0 2 6.453751 6.453751 0 0 0 3 6.429171 6.429171 0 0 0 4 6.403535 6.403535 0 0 0 5 6.403535 6.403535 0 0 0 6 6.394487 6.394487 0 0 0 trend_adx_neg trend_cci trend_visual_ichimoku_a trend_visual_ichimoku_b 1 0 0.00000 57.11322 56.24289 2 0 66.66667 57.11322 56.24289 3 0 -100.00000 57.11322 56.24289 4 0 -112.34242 57.11322 56.24289 5 0 -69.70293 57.11322 56.24289 6 0 -74.98892 57.11322 56.24289 trend_aroon_up trend_aroon_down trend_aroon_ind trend_psar_up trend_psar_down 1 0 0 0 6.15562 6.469284 2 4 0 4 6.15562 6.469284 3 4 8 -4 6.15562 6.469284 4 4 12 -8 6.15562 6.502157 5 4 12 -8 6.15562 6.494268 6 4 20 -16 6.15562 6.486693 trend_psar_up_indicator trend_psar_down_indicator momentum_rsi 1 0 0 100.000000 2 0 0 100.000000 3 0 1 9.152434 4 0 0 8.231324 5 0 0 33.933418 6 0 0 24.188559 momentum_stoch_rsi momentum_stoch_rsi_k momentum_stoch_rsi_d momentum_tsi 1 0 0 0 0.00000 2 0 0 0 100.00000 3 0 0 0 81.42043 4 0 0 0 66.95266 5 0 0 0 58.04171 6 0 0 0 45.52531 momentum_uo momentum_stoch momentum_stoch_signal momentum_wr momentum_ao 1 0.00000 76.886840 76.88684 -23.11316 0.00000000 2 25.33655 62.304802 69.59582 -37.69520 0.00000000 3 15.10084 4.545593 47.91241 -95.45441 0.00000000 4 24.22207 23.394494 30.08163 -76.60551 0.00000000 5 39.10976 44.800890 24.24699 -55.19911 0.00000000 6 38.49324 23.249596 30.48166 -76.75040 -0.00805275 momentum_roc momentum_ppo momentum_ppo_signal momentum_ppo_hist momentum_pvo 1 0 0.00000000 0.000000000 0.00000000 0.0000000 2 0 0.01378854 0.002757708 0.01103083 1.7198781 3 0 -0.10276437 -0.018346708 -0.08441766 2.2401939 4 0 -0.20770532 -0.056218431 -0.15148689 1.4542460 5 0 -0.23530543 -0.092035831 -0.14326960 0.3496296 6 0 -0.32441803 -0.138512270 -0.18590576 -0.2972685 momentum_pvo_signal momentum_pvo_hist momentum_kama others_dr others_dlr 1 0.0000000 0.0000000 6.454506 0.0000000 0.0000000 2 0.3439756 1.3759025 6.459657 0.1728713 0.1727221 3 0.7232193 1.5169746 6.415663 -1.5906141 -1.6034001 4 0.8694246 0.5848213 6.386017 -0.1848724 -0.1850435 5 0.7654656 -0.4158360 6.389281 0.6648181 0.6626179 6 0.5529188 -0.8501873 6.365769 -0.8821288 -0.8860425
# Step 1: Perform the correlation-based filtering in R
ro.r('''
# Initialize a vector to store relevant variable names
relevant_predictors <- c()
# Loop through each predictor (column), calculate its correlation to the response
for (colname in colnames(numeric_predictors)) {
# Correlation to response (Close)
correlation <- cor(numeric_predictors[[colname]], response$Close, use = "complete.obs")
# If the absolute correlation meets the threshold, print and store
if (abs(correlation) > 0.2) {
cat("Analyzing:", colname, " ")
cat("Correlation with Close:", correlation, "\n")
# Append the relevant predictors to the vector
relevant_predictors <- c(relevant_predictors, colname)
}
}
# Extract only the relevant predictors from numeric_predictors
corr_based_filtered_predictors <- numeric_predictors[, relevant_predictors, drop=FALSE]
print(head(corr_based_filtered_predictors))
''')
Analyzing: Open Open Correlation with Close: 0.9996517
Analyzing: High Correlation with Close: 0.9998147
Analyzing: Low Correlation with Close: 0.9998257
Analyzing: Volume Correlation with Close: -0.4878077
Analyzing: volume_adi Correlation with Close: 0.706551
Analyzing: volume_obv Correlation with Close: 0.5390079
Analyzing: volume_vpt Correlation with Close: 0.4129055
Analyzing: volume_vwap Correlation with Close: 0.9979931
Analyzing: volume_nvi Correlation with Close: 0.9791192
Analyzing: volatility_bbm Correlation with Close: 0.9968251
Analyzing: volatility_bbh Correlation with Close: 0.9955588
Analyzing: volatility_bbl Correlation with Close: 0.995199
Analyzing: volatility_bbw Correlation with Close: 0.2045607
Analyzing: volatility_kcc Correlation with Close: 0.9984521
Analyzing: volatility_kch Correlation with Close: 0.9980442
Analyzing: volatility_kcl Correlation with Close: 0.9986193
Analyzing: volatility_kcw Correlation with Close: 0.2045559
Analyzing: volatility_dcl Correlation with Close: 0.995478
Analyzing: volatility_dch Correlation with Close: 0.9952855
Analyzing: volatility_dcm Correlation with Close: 0.9966783
Analyzing: volatility_dcw Correlation with Close: 0.2319696
Analyzing: volatility_atr Correlation with Close: 0.8684671
Analyzing: trend_macd Correlation with Close: 0.5134068
Analyzing: trend_macd_signal Correlation with Close: 0.5287249
Analyzing: trend_sma_fast Correlation with Close: 0.9981969
Analyzing: trend_sma_slow Correlation with Close: 0.9957039
Analyzing: trend_ema_fast Correlation with Close: 0.9986977
Analyzing: trend_ema_slow Correlation with Close: 0.9969542
Analyzing: trend_kst Correlation with Close: 0.2163967
Analyzing: trend_kst_sig Correlation with Close: 0.218394
Analyzing: trend_ichimoku_conv Correlation with Close: 0.9986494
Analyzing: trend_ichimoku_base Correlation with Close: 0.9956461
Analyzing: trend_ichimoku_a Correlation with Close: 0.9977339
Analyzing: trend_ichimoku_b Correlation with Close: 0.9929489
Analyzing: trend_visual_ichimoku_a Correlation with Close: 0.9604087
Analyzing: trend_visual_ichimoku_b Correlation with Close: 0.9540602
Analyzing: trend_psar_up Correlation with Close: 0.994305
Analyzing: trend_psar_down Correlation with Close: 0.9918489
Analyzing: momentum_ao Correlation with Close: 0.4394496
Analyzing: momentum_ppo Correlation with Close: 0.2081405
Analyzing: momentum_ppo_signal Correlation with Close: 0.2168345
Analyzing: momentum_kama Correlation with Close: 0.9979848
Open High Low Volume volume_adi volume_obv volume_vpt
1 6.437013 6.469284 6.405345 493729600 265496575 493729600 0
2 6.472299 6.502157 6.431583 601904800 244921715 1095634400 1040521
3 6.465665 6.491301 6.356185 552160000 -253007153 543474400 -7742214
4 6.386344 6.393885 6.304913 477131200 -235219817 66343200 -8624298
5 6.342612 6.393884 6.305214 447610800 206296808 513954000 -5648500
6 6.418012 6.424045 6.286817 462229600 81344925 51724400 -9725960
volume_vwap volume_nvi volatility_bbm volatility_bbh volatility_bbl
1 6.443045 1000.0000 6.454506 6.454506 6.454506
2 6.455913 1000.0000 6.460085 6.471243 6.448927
3 6.438328 984.0939 6.427663 6.519817 6.335510
4 6.418484 982.2745 6.408512 6.512293 6.304730
5 6.409026 988.8049 6.405465 6.499087 6.311844
6 6.399921 988.8049 6.394035 6.493620 6.294450
volatility_bbw volatility_kcc volatility_kch volatility_kcl volatility_kcw
1 0.0000000 6.443045 6.506984 6.379106 1.984735
2 0.3454441 6.454757 6.522013 6.387500 2.083938
3 2.8674042 6.437650 6.527526 6.347773 2.792214
4 3.2388640 6.415725 6.505375 6.326075 2.794706
5 2.9231717 6.405405 6.494859 6.315951 2.793084
6 3.1149406 6.396046 6.493462 6.298629 3.046143
volatility_dcl volatility_dch volatility_dcm volatility_dcw volatility_atr
1 6.405345 6.469284 6.437315 0.9906056 0
2 6.405345 6.502157 6.453751 1.4986158 0
3 6.356185 6.502157 6.429171 2.2710041 0
4 6.304913 6.502157 6.403535 3.0778545 0
5 6.304913 6.502157 6.403535 3.0793183 0
6 6.286817 6.502157 6.394487 3.3678253 0
trend_macd trend_macd_signal trend_sma_fast trend_sma_slow trend_ema_fast
1 0.000000000 0.0000000000 6.454506 6.454506 6.454506
2 0.000890096 0.0001780192 6.460085 6.460085 6.456223
3 -0.006626740 -0.0011829326 6.427663 6.427663 6.441853
4 -0.013378846 -0.0036221153 6.408512 6.408512 6.427884
5 -0.015148279 -0.0059273480 6.405465 6.405465 6.422561
6 -0.020860859 -0.0089140501 6.394035 6.394035 6.409379
trend_ema_slow trend_kst trend_kst_sig trend_ichimoku_conv
1 6.454506 -888.0656 -888.0656 6.437315
2 6.455332 -887.9689 -888.0173 6.453751
3 6.448480 -888.5311 -888.1886 6.429171
4 6.441263 -888.8633 -888.3572 6.403535
5 6.437709 -888.9161 -888.4690 6.403535
6 6.430240 -889.1143 -888.5766 6.394487
trend_ichimoku_base trend_ichimoku_a trend_ichimoku_b trend_visual_ichimoku_a
1 6.437315 6.437315 6.437315 57.11322
2 6.453751 6.453751 6.453751 57.11322
3 6.429171 6.429171 6.429171 57.11322
4 6.403535 6.403535 6.403535 57.11322
5 6.403535 6.403535 6.403535 57.11322
6 6.394487 6.394487 6.394487 57.11322
trend_visual_ichimoku_b trend_psar_up trend_psar_down momentum_ao
1 56.24289 6.15562 6.469284 0.00000000
2 56.24289 6.15562 6.469284 0.00000000
3 56.24289 6.15562 6.469284 0.00000000
4 56.24289 6.15562 6.502157 0.00000000
5 56.24289 6.15562 6.494268 0.00000000
6 56.24289 6.15562 6.486693 -0.00805275
momentum_ppo momentum_ppo_signal momentum_kama
1 0.00000000 0.000000000 6.454506
2 0.01378854 0.002757708 6.459657
3 -0.10276437 -0.018346708 6.415663
4 -0.20770532 -0.056218431 6.386017
5 -0.23530543 -0.092035831 6.389281
6 -0.32441803 -0.138512270 6.365769
ro.r(''' # Data parsing check
# Expecting 93
print(ncol(train_data))
# Expecting 92
print(ncol(predictors))
# Expecting 91
print(ncol(numeric_predictors))
# Expecting 42
print(ncol(corr_based_filtered_predictors))
''')
[1] 93 93 [1] 92 [1] 91 [1] 42
ro.r(''' # Number of missing values
sum(is.na(corr_based_filtered_predictors))
''')
| 0 |
import rpy2.robjects as ro # Step 2: Generate histograms based on filtered predictors and the Close variable
from rpy2.robjects.lib import grdevices
from IPython.display import Image, display
def display_histograms():
# Fetch column names from the filtered predictors
colnames = list(ro.r('colnames(corr_based_filtered_predictors)'))
# Add the 'Close' column separately as response
colnames.append("Close")
for colname in colnames:
colname_str = str(colname) # Convert to Python string
filename = f"histogram_{colname_str}.png" # Save each histogram as a PNG file
# Determine which dataset to use (filtered predictors or response)
if colname_str == 'Close':
# For 'Close', use the numeric vector
dataset = "response_numeric"
ro.r(f'''
# Open a PNG plotting device in R
png(file="{filename}", width=512, height=512)
# Generate the histogram using R
hist({dataset}, probability = TRUE,
main="Histogram of Close",
xlab="Close",
col="lightblue",
border="black")
# Add a normal distribution curve
curve(dnorm(x, mean=mean({dataset}, na.rm=TRUE),
sd=sd({dataset}, na.rm=TRUE)),
col="red", lwd=2, add=TRUE)
dev.off()
''')
else:
# For the predictors, use corr_based_filtered_predictors dataset
dataset = "corr_based_filtered_predictors"
ro.r(f'''
# Open a PNG plotting device in R
png(file="{filename}", width=512, height=512)
# Generate the histogram using R
hist({dataset}[["{colname_str}"]], probability = TRUE,
main=paste("Histogram of", "{colname_str}"),
xlab="{colname_str}",
col="lightblue",
border="black")
# Add a normal distribution curve
curve(dnorm(x, mean=mean({dataset}[["{colname_str}"]], na.rm=TRUE),
sd=sd({dataset}[["{colname_str}"]], na.rm=TRUE)),
col="red", lwd=2, add=TRUE)
dev.off()
''')
# Display the histogram in Python
display(Image(filename))
# Call the function to generate and display histograms
display_histograms()
Data Analysis Insight: After inspecting the histograms for the predictor variables, it is evident that the majority exhibit right-skewness. To stabilize the skewed distributions and improve normality, we will apply log transformations to these variables. This will allow us to better analyze the relationships between predictors and the response variable.
Dynamic Programming Approach: To demonstrate flexibility and dynamic programming skills, I will perform the Box-Cox transformation analysis in Python. This approach enables me to systematically evaluate whether a log transformation or an alternative power transformation is appropriate for each variable, based on their distribution. By leveraging the interoperability between Python and R, I can efficiently switch between these languages to handle different aspects of the analysis and visualization to suit my personal strengths.
# Load R object into Python
data = ro.r('corr_based_filtered_predictors')
ncol_data = ro.r('colnames(corr_based_filtered_predictors)')
# Convert R dataframe to a pandas dataframe using conversion
with conversion.localconverter(ro.default_converter + pandas2ri.converter):
data = ro.conversion.rpy2py(data)
# Ensure it's a pandas dataframe
data = pd.DataFrame(data)
# Create dictionaries to store columns and the data associated with columns
positive_only_indicators = {}
needs_log = {}
no_log_transform = {}
unsure_indicators = {}
# Loop through each column in the dataframe
for column in data.columns:
# Add small value to indicators that range from [0,100]
if (data[column] <= 0).any():
data[column] += 0.001
# After adjusting, check if the column has only positive values for Box-Cox
if (data[column] > 0).all():
# Apply Box-Cox transformation
_, lambda_val = stats.boxcox(data[column].values) # Use .values to pass as numpy array
positive_only_indicators[column] = data[column] # Store the data for the positive-only indicator
# Check if the lambda suggests log transformation (lambda ~ 0) or no transformation (lambda ~ 1)
if -0.125 < lambda_val < 0.125:
print(f"{column} is best log transformed (lambda ~ 0)")
needs_log[column] = data[column] # Store the data for log transformation
elif 0.875 < lambda_val < 1.125:
print(f"{column} does not need transformation (lambda ~ 1)")
no_log_transform[column] = data[column] # Store the data for no log transformation
else:
print(f"{column} is best transformed with lambda: {lambda_val}")
unsure_indicators[column] = data[column] # Store the data for unsure indicators
# Convert 'ncol_data' (an R vector) to a Python list
ncol_data = list(ncol_data)
# Find and print the names of the columns that are in 'ncol_data' but not in 'positive_only_indicators'
non_positive_indicators = list(set(ncol_data) - set(positive_only_indicators.keys()))
# Update unsure_indicators with non-positive columns
for col in non_positive_indicators:
unsure_indicators[col] = data[col]
Open is best log transformed (lambda ~ 0) High is best transformed with lambda: -0.12753048686396823 Low is best log transformed (lambda ~ 0) Volume is best transformed with lambda: -0.19849451185563632 volume_vwap is best log transformed (lambda ~ 0) volume_nvi is best log transformed (lambda ~ 0) volatility_bbm is best log transformed (lambda ~ 0) volatility_bbh is best transformed with lambda: -0.1259423275458452 volatility_bbl is best log transformed (lambda ~ 0) volatility_bbw is best transformed with lambda: 0.2872076581878337 volatility_kcc is best log transformed (lambda ~ 0) volatility_kch is best log transformed (lambda ~ 0) volatility_kcl is best log transformed (lambda ~ 0) volatility_kcw is best transformed with lambda: -0.3844549802996423 volatility_dcl is best log transformed (lambda ~ 0) volatility_dch is best transformed with lambda: -0.12881070304476055 volatility_dcm is best log transformed (lambda ~ 0) volatility_dcw is best log transformed (lambda ~ 0) volatility_atr is best log transformed (lambda ~ 0) trend_sma_fast is best log transformed (lambda ~ 0) trend_sma_slow is best log transformed (lambda ~ 0) trend_ema_fast is best log transformed (lambda ~ 0) trend_ema_slow is best log transformed (lambda ~ 0) trend_ichimoku_conv is best log transformed (lambda ~ 0) trend_ichimoku_base is best log transformed (lambda ~ 0) trend_ichimoku_a is best log transformed (lambda ~ 0) trend_ichimoku_b is best log transformed (lambda ~ 0) trend_visual_ichimoku_a is best log transformed (lambda ~ 0) trend_visual_ichimoku_b is best log transformed (lambda ~ 0) trend_psar_up is best log transformed (lambda ~ 0) trend_psar_down is best log transformed (lambda ~ 0) momentum_kama is best log transformed (lambda ~ 0)
# Expecting total number of indicators based on initial correlation filtering
print(f"Total number of correlation filtered indicators: {len(ncol_data)}")
# Expecting total number of indicators that are positive and pass initial filtering
print(f"Total number of positive-only indicators: {len(positive_only_indicators)}")
# Print number of non-positive indicators
print(f"Total number of non-positive indicators: {len(non_positive_indicators)}")
# Print number of non-positive indicators
print(f"Total number of indicators needing log transformation: {len(needs_log)}")
# Print number of no log transformation indicators
print(f"Total number of indicators with no log transformation: {len(no_log_transform)}")
# Print number of unsure indicators
print(f"Total number of indicators unsure: {len(unsure_indicators)}")
# Print names of non-positive indicators
print(f"Non-positive indicators:({non_positive_indicators})")
Total number of correlation filtered indicators: 42 Total number of positive-only indicators: 32 Total number of non-positive indicators: 10 Total number of indicators needing log transformation: 26 Total number of indicators with no log transformation: 0 Total number of indicators unsure: 16 Non-positive indicators:(['trend_macd_signal', 'volume_adi', 'trend_kst_sig', 'momentum_ppo', 'momentum_ao', 'momentum_ppo_signal', 'volume_vpt', 'volume_obv', 'trend_kst', 'trend_macd'])
print("\nLog needing indicators:")
print('\n'.join(needs_log.keys()))
Log needing indicators: Open Low volume_vwap volume_nvi volatility_bbm volatility_bbl volatility_kcc volatility_kch volatility_kcl volatility_dcl volatility_dcm volatility_dcw volatility_atr trend_sma_fast trend_sma_slow trend_ema_fast trend_ema_slow trend_ichimoku_conv trend_ichimoku_base trend_ichimoku_a trend_ichimoku_b trend_visual_ichimoku_a trend_visual_ichimoku_b trend_psar_up trend_psar_down momentum_kama
print("\nNo log transformation indicators:")
print('\n'.join(no_log_transform.keys()))
No log transformation indicators:
print("\nUnsure indicators:")
print('\n'.join(unsure_indicators.keys()))
Unsure indicators: High Volume volatility_bbh volatility_bbw volatility_kcw volatility_dch trend_macd_signal volume_adi trend_kst_sig momentum_ppo momentum_ao momentum_ppo_signal volume_vpt volume_obv trend_kst trend_macd
Now we will evaluate the unsure indicators using skew and kurtosis and from here, fit a full model to finalize initial base model creation, having started with 90+ predictors for the response variable.
for column in unsure_indicators:
print(f"{column}: Skewness = {skew(data[column])}, Kurtosis = {kurtosis(data[column])}")
High: Skewness = 1.8891342657374528, Kurtosis = 3.753443913653781 Volume: Skewness = 1.8848105638955532, Kurtosis = 5.024020530989679 volatility_bbh: Skewness = 1.8956196401042602, Kurtosis = 3.774120527621692 volatility_bbw: Skewness = 1.053329940447001, Kurtosis = 1.366980568998037 volatility_kcw: Skewness = 2.0274820774137092, Kurtosis = 7.04124288432811 volatility_dch: Skewness = 1.9355368679210487, Kurtosis = 4.007725423404618 trend_macd_signal: Skewness = 1.8399775336285498, Kurtosis = 12.028288152799043 volume_adi: Skewness = -1.1886506344136536, Kurtosis = 2.6198886420116176 trend_kst_sig: Skewness = -4.60234715364812, Kurtosis = 32.64799344463351 momentum_ppo: Skewness = -0.4831524800412622, Kurtosis = 0.6810283720748171 momentum_ao: Skewness = 1.415658601814927, Kurtosis = 11.674507344282445 momentum_ppo_signal: Skewness = -0.4789048444032326, Kurtosis = 0.7012434443939171 volume_vpt: Skewness = 0.14093567543649085, Kurtosis = 0.6253228337353658 volume_obv: Skewness = -0.831913349454897, Kurtosis = 1.6495233192559935 trend_kst: Skewness = -4.413882253592722, Kurtosis = 32.381441966048996 trend_macd: Skewness = 1.814833255851343, Kurtosis = 12.371411900001323
import rpy2.robjects as ro # Data Preparation: Applying Transformations
from rpy2.robjects.lib import grdevices
from IPython.display import Image, display
import numpy as np
import pandas as pd
from rpy2.robjects import pandas2ri
pandas2ri.activate()
# Convert to df for easier analysis
needs_log_df = pd.DataFrame(needs_log)
no_log_transform_df = pd.DataFrame(no_log_transform)
unsure_indicators_df = pd.DataFrame(unsure_indicators)
# List of columns to add to needs_log_df (log transformation)
additional_log_columns = ["High", "Volume", "volatility_bbh", "volatility_dch", "volatility_kcw"]
# Add these columns to needs_log_df
needs_log_df = pd.concat([needs_log_df, data[additional_log_columns]], axis=1)
# List of all columns that need cube transformation instead of cube root
cube_columns = ["volume_adi", "volume_obv", "trend_kst_sig", "trend_kst", "momentum_ao"]
# Apply cube transformation
cube_df = data[cube_columns].apply(lambda x: np.power(x, 3)) # Cube transformation
# List of columns to be added as normal predictors (no transformation needed)
normal_predictors_columns = ["volume_vpt", "momentum_ppo", "momentum_ppo_signal", "volatility_bbw", "trend_macd_signal", "trend_macd"]
# Create normal_predictors_df
normal_predictors_df = data[normal_predictors_columns]
# Apply log transformation to needs_log_df
needs_log_df = needs_log_df.apply(np.log)
# Combine all DataFrames (log transformed, cubed, and normal predictors)
final_df = pd.concat([needs_log_df, cube_df, normal_predictors_df], axis=1)
# Add the response variable and apply log transformation
response_r = ro.r('train_data[["Close"]]') # Fetch the correct response from R
response_r = np.log(response_r) # Log transformation of the response
response_r = pd.DataFrame(response_r)
final_df['log_Close'] = response_r.to_numpy()[:, 0]
# Drop any rows with missing values
final_df = final_df.dropna(axis=0)
# Ensure no columns are missing from the transformations
all_transformed_columns = list(needs_log_df.columns) + list(cube_df.columns) + list(normal_predictors_df.columns)
# Find and print the names of the columns that are tricky based on box cox, skewness, and kurtosis, likely due to outliers
predictors_need_care = list(set(data.columns) - set(all_transformed_columns))
# Check if predictors_need_care is empty
if not predictors_need_care:
print("All columns have been transformed successfully.")
else:
print(f"The following columns still need care: {predictors_need_care}")
predictors_need_care_df = data[predictors_need_care] # Create df for tricky predictors
All columns have been transformed successfully.
# Data Visualization: Renaming Columns and Displaying Histograms
def display_transformed_histograms():
# Create a copy of final_df to avoid modifying the original data
transformed_df = final_df.copy()
# Get the column names from final_df
colnames = final_df.columns
# Loop through each column and apply transformation logic
for colname in colnames:
colname_str = str(colname)
# Determine the type of transformation applied
if colname_str in needs_log_df.columns:
transform_type = 'Log-Transformed'
new_colname = f"log_{colname_str}"
transformed_df.rename(columns={colname_str: new_colname}, inplace=True)
elif colname_str in cube_df.columns:
transform_type = 'Cube Transformed'
new_colname = f"cube_{colname_str}"
transformed_df.rename(columns={colname_str: new_colname}, inplace=True)
elif colname_str == "log_Close": # Explicitly handle log_Close
transform_type = 'Log-Transformed'
new_colname = colname_str
else:
transform_type = 'Untransformed'
new_colname = colname_str
# Create a filename for the histogram
filename = f"histogram_{new_colname}.png"
# Convert the transformed DataFrame to an R DataFrame
transformed_df_r = pandas2ri.py2rpy(transformed_df)
# Assign the R dataframe to a variable in R
ro.globalenv['transformed_df'] = transformed_df_r
# Open a PNG plotting device in R
grdevices.png(file=filename, width=512, height=512)
# Prepare the R code for plotting
ro.r(f'''
# Extract the data column
data_vector <- transformed_df[["{new_colname}"]]
# Plot the histogram
hist(data_vector, probability = TRUE,
main=paste("{transform_type} Histogram of", "{colname}"),
xlab=paste("{transform_type}", "{colname}"),
col="lightblue",
border="black")
# Add a normal distribution curve
curve(dnorm(x, mean=mean(data_vector, na.rm=TRUE), sd=sd(data_vector, na.rm=TRUE)),
col="red", lwd=2, add=TRUE)
''')
# Close the PNG device
grdevices.dev_off()
# Display the histogram in Python
display(Image(filename))
return transformed_df
# Call the function to display histograms and return the transformed DataFrame
transformed_df = display_transformed_histograms()
transformed_df.head()
# this is as desired
| log_Open | log_Low | log_volume_vwap | log_volume_nvi | log_volatility_bbm | log_volatility_bbl | log_volatility_kcc | log_volatility_kch | log_volatility_kcl | log_volatility_dcl | ... | cube_trend_kst_sig | cube_trend_kst | cube_momentum_ao | volume_vpt | momentum_ppo | momentum_ppo_signal | volatility_bbw | trend_macd_signal | trend_macd | log_Close | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1.862065 | 1.857133 | 1.863001 | 6.907755 | 1.864778 | 1.864778 | 1.863001 | 1.872876 | 1.853028 | 1.857133 | ... | -7.003800e+08 | -7.003800e+08 | 1.000000e-09 | 1.000000e-03 | 0.001000 | 0.001000 | 0.001000 | 0.001000 | 0.001000 | 1.864778 |
| 2 | 1.867531 | 1.861221 | 1.864996 | 6.907755 | 1.865642 | 1.863914 | 1.864817 | 1.875183 | 1.854343 | 1.857133 | ... | -7.002655e+08 | -7.001511e+08 | 1.000000e-09 | 1.040521e+06 | 0.014789 | 0.003758 | 0.346444 | 0.001178 | 0.001890 | 1.866506 |
| 3 | 1.866506 | 1.849428 | 1.862269 | 6.891721 | 1.860611 | 1.846170 | 1.862163 | 1.876028 | 1.848104 | 1.849428 | ... | -7.006709e+08 | -7.014819e+08 | 1.000000e-09 | -7.742214e+06 | -0.101764 | -0.017347 | 2.868404 | -0.000183 | -0.005627 | 1.850472 |
| 4 | 1.854162 | 1.841329 | 1.859182 | 6.889871 | 1.857627 | 1.841300 | 1.858752 | 1.872629 | 1.844680 | 1.841329 | ... | -7.010701e+08 | -7.022689e+08 | 1.000000e-09 | -8.624298e+06 | -0.206705 | -0.055218 | 3.239864 | -0.002622 | -0.012379 | 1.848621 |
| 5 | 1.847291 | 1.841377 | 1.857707 | 6.896497 | 1.857152 | 1.842428 | 1.857142 | 1.871011 | 1.843078 | 1.841329 | ... | -7.013348e+08 | -7.023941e+08 | 1.000000e-09 | -5.648500e+06 | -0.234305 | -0.091036 | 2.924172 | -0.004927 | -0.014148 | 1.855247 |
5 rows × 43 columns
Cube Root transformations were not good, went back and did Cube transformations, these were better. Some variables likely have outliers making simple transformations ineffective. We need to take a closer look at these variables later, potentiall using robust regression methods to take care of outliers.
# Convert needs_log_df to R DataFrame and move to R environment
needs_log_df_r = pandas2ri.py2rpy(needs_log_df.dropna()) # Use dropna to handle potential errors
ro.globalenv['needs_log_df'] = needs_log_df_r
ro.globalenv['final_df'] = final_df
print(ro.r('head(needs_log_df)'))
Open Low volume_vwap volume_nvi volatility_bbm volatility_bbl 1 1.862065 1.857133 1.863001 6.907755 1.864778 1.864778 2 1.867531 1.861221 1.864996 6.907755 1.865642 1.863914 3 1.866506 1.849428 1.862269 6.891721 1.860611 1.846170 4 1.854162 1.841329 1.859182 6.889871 1.857627 1.841300 5 1.847291 1.841377 1.857707 6.896497 1.857152 1.842428 6 1.859108 1.838455 1.856286 6.896497 1.855366 1.839668 volatility_kcc volatility_kch volatility_kcl volatility_dcl volatility_dcm 1 1.863001 1.872876 1.853028 1.857133 1.862112 2 1.864817 1.875183 1.854343 1.857133 1.864662 3 1.862163 1.876028 1.848104 1.849428 1.860846 4 1.858752 1.872629 1.844680 1.841329 1.856850 5 1.857142 1.871011 1.843078 1.841329 1.856850 6 1.855680 1.870796 1.840332 1.838455 1.855436 volatility_dcw volatility_atr trend_sma_fast trend_sma_slow trend_ema_fast 1 -0.009438764 -6.907755 1.864778 1.864778 1.864778 2 0.404541909 -6.907755 1.865642 1.865642 1.865044 3 0.820222064 -6.907755 1.860611 1.860611 1.862816 4 1.124232772 -6.907755 1.857627 1.857627 1.860645 5 1.124708244 -6.907755 1.857152 1.857152 1.859817 6 1.214267211 -6.907755 1.855366 1.855366 1.857762 trend_ema_slow trend_ichimoku_conv trend_ichimoku_base trend_ichimoku_a 1 1.864778 1.862112 1.862112 1.862112 2 1.864907 1.864662 1.864662 1.864662 3 1.863844 1.860846 1.860846 1.860846 4 1.862725 1.856850 1.856850 1.856850 5 1.862173 1.856850 1.856850 1.856850 6 1.861012 1.855436 1.855436 1.855436 trend_ichimoku_b trend_visual_ichimoku_a trend_visual_ichimoku_b 1 1.862112 4.045036 4.02968 2 1.864662 4.045036 4.02968 3 1.860846 4.045036 4.02968 4 1.856850 4.045036 4.02968 5 1.856850 4.045036 4.02968 6 1.855436 4.045036 4.02968 trend_psar_up trend_psar_down momentum_kama High Volume volatility_bbh 1 1.817366 1.867065 1.864778 1.867065 20.01750 1.864778 2 1.817366 1.867065 1.865576 1.872134 20.21561 1.867368 3 1.817366 1.867065 1.858742 1.870463 20.12935 1.874846 4 1.817366 1.872134 1.854111 1.855342 19.98330 1.873692 5 1.817366 1.870920 1.854622 1.855342 19.91943 1.871662 6 1.817366 1.869753 1.850935 1.860048 19.95157 1.870820 volatility_dch volatility_kcw 1 1.867065 0.6854856 2 1.872134 0.7342592 3 1.872134 1.0268348 4 1.872134 1.0277271 5 1.872134 1.0271463 6 1.872134 1.1138761
# Convert normal_predictors_df to R DataFrame and move to R environment
normal_predictors_df_r = pandas2ri.py2rpy(normal_predictors_df.dropna()) # Use dropna to handle potential errors
ro.globalenv['normal_predictors_df'] = normal_predictors_df_r
print(ro.r('head(normal_predictors_df)'))
volume_vpt momentum_ppo momentum_ppo_signal volatility_bbw 1 0.001 0.00100000 0.001000000 0.0010000 2 1040520.766 0.01478854 0.003757708 0.3464441 3 -7742214.017 -0.10176437 -0.017346708 2.8684042 4 -8624297.733 -0.20670532 -0.055218431 3.2398640 5 -5648500.127 -0.23430543 -0.091035831 2.9241717 6 -9725960.320 -0.32341803 -0.137512270 3.1159406 trend_macd_signal trend_macd 1 0.0010000000 0.001000000 2 0.0011780192 0.001890096 3 -0.0001829326 -0.005626740 4 -0.0026221153 -0.012378846 5 -0.0049273480 -0.014148279 6 -0.0079140501 -0.019860859
# Convert cube_root_df to R DataFrame and move to R environment
cube_df_r = pandas2ri.py2rpy(cube_df.dropna()) # Use dropna to handle potential errors
ro.globalenv['cube_df'] = cube_df_r
print(ro.r('head(cube_df)'))
volume_adi volume_obv trend_kst_sig trend_kst momentum_ao 1 1.871444e+25 1.203559e+26 -700379993 -700379993 1.000000e-09 2 1.469203e+25 1.315216e+27 -700265544 -700151107 1.000000e-09 3 -1.619565e+25 1.605230e+26 -700670855 -701481946 1.000000e-09 4 -1.301433e+25 2.920043e+23 -701070131 -702268869 1.000000e-09 5 8.779657e+24 1.357603e+26 -701334790 -702394096 1.000000e-09 6 5.382591e+23 1.383842e+23 -701589521 -702864098 -3.508128e-07
print(ro.r('head(final_df)')) # In R, final_df has log_Close
Open Low volume_vwap volume_nvi volatility_bbm volatility_bbl 1 1.862065 1.857133 1.863001 6.907755 1.864778 1.864778 2 1.867531 1.861221 1.864996 6.907755 1.865642 1.863914 3 1.866506 1.849428 1.862269 6.891721 1.860611 1.846170 4 1.854162 1.841329 1.859182 6.889871 1.857627 1.841300 5 1.847291 1.841377 1.857707 6.896497 1.857152 1.842428 6 1.859108 1.838455 1.856286 6.896497 1.855366 1.839668 volatility_kcc volatility_kch volatility_kcl volatility_dcl volatility_dcm 1 1.863001 1.872876 1.853028 1.857133 1.862112 2 1.864817 1.875183 1.854343 1.857133 1.864662 3 1.862163 1.876028 1.848104 1.849428 1.860846 4 1.858752 1.872629 1.844680 1.841329 1.856850 5 1.857142 1.871011 1.843078 1.841329 1.856850 6 1.855680 1.870796 1.840332 1.838455 1.855436 volatility_dcw volatility_atr trend_sma_fast trend_sma_slow trend_ema_fast 1 -0.009438764 -6.907755 1.864778 1.864778 1.864778 2 0.404541909 -6.907755 1.865642 1.865642 1.865044 3 0.820222064 -6.907755 1.860611 1.860611 1.862816 4 1.124232772 -6.907755 1.857627 1.857627 1.860645 5 1.124708244 -6.907755 1.857152 1.857152 1.859817 6 1.214267211 -6.907755 1.855366 1.855366 1.857762 trend_ema_slow trend_ichimoku_conv trend_ichimoku_base trend_ichimoku_a 1 1.864778 1.862112 1.862112 1.862112 2 1.864907 1.864662 1.864662 1.864662 3 1.863844 1.860846 1.860846 1.860846 4 1.862725 1.856850 1.856850 1.856850 5 1.862173 1.856850 1.856850 1.856850 6 1.861012 1.855436 1.855436 1.855436 trend_ichimoku_b trend_visual_ichimoku_a trend_visual_ichimoku_b 1 1.862112 4.045036 4.02968 2 1.864662 4.045036 4.02968 3 1.860846 4.045036 4.02968 4 1.856850 4.045036 4.02968 5 1.856850 4.045036 4.02968 6 1.855436 4.045036 4.02968 trend_psar_up trend_psar_down momentum_kama High Volume volatility_bbh 1 1.817366 1.867065 1.864778 1.867065 20.01750 1.864778 2 1.817366 1.867065 1.865576 1.872134 20.21561 1.867368 3 1.817366 1.867065 1.858742 1.870463 20.12935 1.874846 4 1.817366 1.872134 1.854111 1.855342 19.98330 1.873692 5 1.817366 1.870920 1.854622 1.855342 19.91943 1.871662 6 1.817366 1.869753 1.850935 1.860048 19.95157 1.870820 volatility_dch volatility_kcw volume_adi volume_obv trend_kst_sig 1 1.867065 0.6854856 1.871444e+25 1.203559e+26 -700379993 2 1.872134 0.7342592 1.469203e+25 1.315216e+27 -700265544 3 1.872134 1.0268348 -1.619565e+25 1.605230e+26 -700670855 4 1.872134 1.0277271 -1.301433e+25 2.920043e+23 -701070131 5 1.872134 1.0271463 8.779657e+24 1.357603e+26 -701334790 6 1.872134 1.1138761 5.382591e+23 1.383842e+23 -701589521 trend_kst momentum_ao volume_vpt momentum_ppo momentum_ppo_signal 1 -700379993 1.000000e-09 0.001 0.00100000 0.001000000 2 -700151107 1.000000e-09 1040520.766 0.01478854 0.003757708 3 -701481946 1.000000e-09 -7742214.017 -0.10176437 -0.017346708 4 -702268869 1.000000e-09 -8624297.733 -0.20670532 -0.055218431 5 -702394096 1.000000e-09 -5648500.127 -0.23430543 -0.091035831 6 -702864098 -3.508128e-07 -9725960.320 -0.32341803 -0.137512270 volatility_bbw trend_macd_signal trend_macd log_Close 1 0.0010000 0.0010000000 0.001000000 1.864778 2 0.3464441 0.0011780192 0.001890096 1.866506 3 2.8684042 -0.0001829326 -0.005626740 1.850472 4 3.2398640 -0.0026221153 -0.012378846 1.848621 5 2.9241717 -0.0049273480 -0.014148279 1.855247 6 3.1159406 -0.0079140501 -0.019860859 1.846387
print(ro.r('head(transformed_df)')) # In R, tranformed_df adjusts column names appropriately
# Transformed response and predictor
print(ro.r('ncol(transformed_df)'))
log_Open log_Low log_volume_vwap log_volume_nvi log_volatility_bbm
1 1.862065 1.857133 1.863001 6.907755 1.864778
2 1.867531 1.861221 1.864996 6.907755 1.865642
3 1.866506 1.849428 1.862269 6.891721 1.860611
4 1.854162 1.841329 1.859182 6.889871 1.857627
5 1.847291 1.841377 1.857707 6.896497 1.857152
6 1.859108 1.838455 1.856286 6.896497 1.855366
log_volatility_bbl log_volatility_kcc log_volatility_kch log_volatility_kcl
1 1.864778 1.863001 1.872876 1.853028
2 1.863914 1.864817 1.875183 1.854343
3 1.846170 1.862163 1.876028 1.848104
4 1.841300 1.858752 1.872629 1.844680
5 1.842428 1.857142 1.871011 1.843078
6 1.839668 1.855680 1.870796 1.840332
log_volatility_dcl log_volatility_dcm log_volatility_dcw log_volatility_atr
1 1.857133 1.862112 -0.009438764 -6.907755
2 1.857133 1.864662 0.404541909 -6.907755
3 1.849428 1.860846 0.820222064 -6.907755
4 1.841329 1.856850 1.124232772 -6.907755
5 1.841329 1.856850 1.124708244 -6.907755
6 1.838455 1.855436 1.214267211 -6.907755
log_trend_sma_fast log_trend_sma_slow log_trend_ema_fast log_trend_ema_slow
1 1.864778 1.864778 1.864778 1.864778
2 1.865642 1.865642 1.865044 1.864907
3 1.860611 1.860611 1.862816 1.863844
4 1.857627 1.857627 1.860645 1.862725
5 1.857152 1.857152 1.859817 1.862173
6 1.855366 1.855366 1.857762 1.861012
log_trend_ichimoku_conv log_trend_ichimoku_base log_trend_ichimoku_a
1 1.862112 1.862112 1.862112
2 1.864662 1.864662 1.864662
3 1.860846 1.860846 1.860846
4 1.856850 1.856850 1.856850
5 1.856850 1.856850 1.856850
6 1.855436 1.855436 1.855436
log_trend_ichimoku_b log_trend_visual_ichimoku_a log_trend_visual_ichimoku_b
1 1.862112 4.045036 4.02968
2 1.864662 4.045036 4.02968
3 1.860846 4.045036 4.02968
4 1.856850 4.045036 4.02968
5 1.856850 4.045036 4.02968
6 1.855436 4.045036 4.02968
log_trend_psar_up log_trend_psar_down log_momentum_kama log_High log_Volume
1 1.817366 1.867065 1.864778 1.867065 20.01750
2 1.817366 1.867065 1.865576 1.872134 20.21561
3 1.817366 1.867065 1.858742 1.870463 20.12935
4 1.817366 1.872134 1.854111 1.855342 19.98330
5 1.817366 1.870920 1.854622 1.855342 19.91943
6 1.817366 1.869753 1.850935 1.860048 19.95157
log_volatility_bbh log_volatility_dch log_volatility_kcw cube_volume_adi
1 1.864778 1.867065 0.6854856 1.871444e+25
2 1.867368 1.872134 0.7342592 1.469203e+25
3 1.874846 1.872134 1.0268348 -1.619565e+25
4 1.873692 1.872134 1.0277271 -1.301433e+25
5 1.871662 1.872134 1.0271463 8.779657e+24
6 1.870820 1.872134 1.1138761 5.382591e+23
cube_volume_obv cube_trend_kst_sig cube_trend_kst cube_momentum_ao
1 1.203559e+26 -700379993 -700379993 1.000000e-09
2 1.315216e+27 -700265544 -700151107 1.000000e-09
3 1.605230e+26 -700670855 -701481946 1.000000e-09
4 2.920043e+23 -701070131 -702268869 1.000000e-09
5 1.357603e+26 -701334790 -702394096 1.000000e-09
6 1.383842e+23 -701589521 -702864098 -3.508128e-07
volume_vpt momentum_ppo momentum_ppo_signal volatility_bbw
1 0.001 0.00100000 0.001000000 0.0010000
2 1040520.766 0.01478854 0.003757708 0.3464441
3 -7742214.017 -0.10176437 -0.017346708 2.8684042
4 -8624297.733 -0.20670532 -0.055218431 3.2398640
5 -5648500.127 -0.23430543 -0.091035831 2.9241717
6 -9725960.320 -0.32341803 -0.137512270 3.1159406
trend_macd_signal trend_macd log_Close
1 0.0010000000 0.001000000 1.864778
2 0.0011780192 0.001890096 1.866506
3 -0.0001829326 -0.005626740 1.850472
4 -0.0026221153 -0.012378846 1.848621
5 -0.0049273480 -0.014148279 1.855247
6 -0.0079140501 -0.019860859 1.846387
[1] 43
# Retrieve the test_data from the R environment with adjustments
test_data = pandas2ri.rpy2py(ro.globalenv['test_data'])
# Function to apply the same transformations to the test set
def transform_test_set(test_data):
# Create a copy of test_data to avoid modifying the original test data
transformed_test_df = test_data.copy()
# Get the column names from the training set (transformed_df)
colnames = transformed_df.columns # This includes the transformed names
# Apply transformations to the test set based on training set transformations
for colname in test_data.columns:
colname_str = str(colname)
# Check if the column was log-transformed in the training set
if f"log_{colname_str}" in colnames:
# Apply log transformation and rename the column
transformed_test_df[f"log_{colname_str}"] = np.log(transformed_test_df[colname_str])
transformed_test_df.drop(columns=[colname_str], inplace=True) # Drop the original column
# Check if the column was cube root-transformed in the training set
elif f"cube_{colname_str}" in colnames:
# Apply cube root transformation and rename the column
transformed_test_df[f"cube_{colname_str}"] = transformed_test_df[colname_str].apply(lambda x: np.power(x, 3))
transformed_test_df.drop(columns=[colname_str], inplace=True) # Drop the original column
# Return the transformed test set
return transformed_test_df
# Apply the function to transform the test set
transformed_test_df = transform_test_set(test_data)
print(transformed_test_df.head())
# Now the test set (transformed_test_df) has the same transformations and column names as the training set (transformed_df)
Date Dividends Stock Splits volume_cmf volume_fi volume_em \ 1 18631.0 0.0 0.0 -0.014691 -1.765197e+07 -13.948546 2 18632.0 0.0 0.0 0.007172 6.722631e+06 -0.324804 3 18633.0 0.0 0.0 -0.066527 -8.988428e+07 -3.953301 4 18634.0 0.0 0.0 -0.048975 -1.084356e+07 3.395838 5 18635.0 0.0 0.0 0.009339 7.323769e+06 3.685345 volume_sma_em volume_vpt volume_mfi volatility_bbw ... \ 1 1.074958 2.537085e+08 54.913227 14.260615 ... 2 0.928156 2.549160e+08 55.549845 13.712819 ... 3 0.059892 2.496955e+08 46.364228 13.352539 ... 4 0.119525 2.534347e+08 46.789745 13.044938 ... 5 0.232506 2.543423e+08 47.199189 12.093633 ... log_trend_ichimoku_conv log_trend_ichimoku_base log_trend_ichimoku_a \ 1 4.854819 4.822735 4.838906 2 4.867362 4.826861 4.847316 3 4.865930 4.829172 4.847720 4 4.865930 4.841614 4.853846 5 4.865930 4.842155 4.854113 log_trend_ichimoku_b log_trend_visual_ichimoku_a \ 1 4.790586 4.729501 2 4.790586 4.729501 3 4.790586 4.729501 4 4.790586 4.729458 5 4.790586 4.738023 log_trend_visual_ichimoku_b log_trend_psar_up log_trend_psar_down \ 1 4.715327 4.815739 4.911668 2 4.715327 4.815739 4.909932 3 4.715327 4.815739 4.908229 4 4.715327 4.815739 4.904772 5 4.715327 4.815739 4.901442 cube_momentum_ao log_momentum_kama 1 890.597161 4.843903 2 561.489737 4.844271 3 268.292663 4.843998 4 140.170349 4.844071 5 90.018258 4.844221 [5 rows x 93 columns]
# Convert final, transformed test df to an R DataFrame
transformed_test_df_r = pandas2ri.py2rpy(transformed_test_df)
# Assign the R dataframe to a variable in R
ro.globalenv['transformed_test_df'] = transformed_test_df_r
ro.r('''
transformed_test_df = as.data.table(transformed_test_df)
print(head(transformed_test_df))
''')
Date Dividends Stock Splits volume_cmf Date Dividends Stock Splits volume_cmf volume_fi volume_em volume_sma_em
<num> <num> <num> <num> <num> <num> <num>
1: 18631 0 0 -0.014690928 -17651971 -13.948546 1.07495835
2: 18632 0 0 0.007171973 6722631 -0.324804 0.92815558
3: 18633 0 0 -0.066527463 -89884281 -3.953301 0.05989194
4: 18634 0 0 -0.048975208 -10843558 3.395838 0.11952482
5: 18635 0 0 0.009338842 7323769 3.685345 0.23250602
6: 18638 0 0 -0.032781904 -36820671 -3.339920 0.12107051
volume_vpt volume_mfi volatility_bbw volatility_bbp volatility_bbhi
<num> <num> <num> <num> <num>
1: 253708522 54.91323 14.26061 0.5743106 0
2: 254916022 55.54984 13.71282 0.6429638 0
3: 249695536 46.36423 13.35254 0.3816069 0
4: 253434671 46.78974 13.04494 0.6164770 0
5: 254342335 47.19919 12.09363 0.6645151 0
6: 252008519 49.81354 11.24627 0.4464819 0
volatility_bbli volatility_kcp volatility_kchi volatility_kcli
<num> <num> <num> <num>
1: 0 0.1707271 0 0
2: 0 0.3447878 0 0
3: 0 -0.2621132 0 1
4: 0 0.3342434 0 0
5: 0 0.4859513 0 0
6: 0 0.1063101 0 0
volatility_dcp volatility_ui trend_macd trend_macd_signal trend_macd_diff
<num> <num> <num> <num> <num>
1: 0.4967812 1.907276 3.389452 3.349603 0.03984921
2: 0.5826176 2.135172 3.166302 3.312943 -0.14664044
3: 0.3460300 2.907064 2.611005 3.172555 -0.56155020
4: 0.5777884 3.118267 2.483540 3.034752 -0.55121204
5: 0.6384121 3.247561 2.443617 2.916525 -0.47290795
6: 0.4621708 3.555237 2.144751 2.762170 -0.61741960
trend_vortex_ind_pos trend_vortex_ind_neg trend_vortex_ind_diff trend_trix
<num> <num> <num> <num>
1: 1.0102009 0.7653968 0.24480407 0.4093338
2: 1.0226346 0.8179947 0.20463998 0.4066834
3: 0.9792360 0.9288597 0.05037637 0.3925853
4: 0.9404317 0.9082032 0.03222857 0.3775901
5: 0.9611539 0.9171152 0.04403875 0.3636756
6: 0.9337240 0.9197245 0.01399951 0.3455297
trend_mass_index trend_dpo trend_kst_diff trend_stc trend_adx trend_adx_pos
<num> <num> <num> <num> <num> <num>
1: 25.66873 -0.2378792 10.2343263 49.999735 24.30929 29.41934
2: 25.94140 0.2046043 7.5640350 24.999868 23.09144 27.45868
3: 26.08026 -1.9319096 0.6388617 12.499934 21.53552 24.93386
4: 26.13021 -0.7150917 -6.1809446 6.249967 20.07573 23.64207
5: 26.12101 2.3553242 -9.6677172 3.124983 19.01872 24.49260
6: 26.09528 1.1737675 -13.6861689 1.562492 17.83205 22.78034
trend_adx_neg trend_cci trend_aroon_up trend_aroon_down trend_aroon_ind
<num> <num> <num> <num> <num>
1: 25.43710 30.45343 88 0 88
2: 23.74184 32.78466 84 0 84
3: 25.59509 -12.14355 80 0 80
4: 23.12832 21.68439 76 0 76
5: 22.03699 44.39186 72 24 48
6: 23.90323 -12.27294 68 20 48
trend_psar_up_indicator trend_psar_down_indicator momentum_rsi
<num> <num> <num>
1: 0 1 54.95762
2: 0 0 57.72290
3: 0 0 48.82539
4: 0 0 55.98301
5: 0 0 57.65158
6: 0 0 51.89596
momentum_stoch_rsi momentum_stoch_rsi_k momentum_stoch_rsi_d momentum_tsi
<num> <num> <num> <num>
1: 0.0000000 0.32577450 0.5522281 28.10394
2: 0.1479567 0.18788292 0.3605759 26.12204
3: 0.0000000 0.04931889 0.1876588 21.24648
4: 0.2883579 0.14543818 0.1275467 19.57976
5: 0.3555791 0.21464567 0.1364676 18.84834
6: 0.1237036 0.25588021 0.2053214 16.25136
momentum_uo momentum_stoch momentum_stoch_signal momentum_wr momentum_roc
<num> <num> <num> <num> <num>
1: 45.54722 45.62326 61.66026 -54.37674 1.196447
2: 50.35984 49.28283 53.82103 -50.71717 2.503702
3: 43.73471 20.53449 38.48019 -79.46551 -1.631708
4: 44.27394 48.69602 39.50445 -51.30398 3.363302
5: 48.97105 56.06256 41.76436 -43.93744 2.979005
6: 48.00849 36.04948 46.93602 -63.95052 -2.198985
momentum_ppo momentum_ppo_signal momentum_ppo_hist momentum_pvo
<num> <num> <num> <num>
1: 2.722227 2.724815 -0.002587856 2.357684
2: 2.537360 2.687324 -0.149963761 1.044884
3: 2.093427 2.568545 -0.475117600 4.028895
4: 1.987172 2.452270 -0.465097806 3.150377
5: 1.950271 2.351870 -0.401599652 2.117547
6: 1.710766 2.223649 -0.512883733 0.940333
momentum_pvo_signal momentum_pvo_hist others_dr others_dlr log_Open
<num> <num> <num> <num> <num>
1: 1.149709 1.20797511 -2.4719204 -2.5029853 4.872957
2: 1.128744 -0.08385972 1.2363705 1.2287898 4.837665
3: 1.708774 2.32012043 -3.3661441 -3.4241031 4.828546
4: 1.997095 1.15328231 3.4122976 3.3553701 4.833544
5: 2.021185 0.09636208 0.8631419 0.8594382 4.864760
6: 1.805015 -0.86468179 -2.3248771 -2.3523287 4.839990
log_High log_Low log_Close log_Volume cube_volume_adi cube_volume_obv
<num> <num> <num> <num> <num> <num>
1: 4.873631 4.821001 4.841691 18.78046 1.640976e+31 5.838679e+30
2: 4.859536 4.834090 4.853979 18.39705 1.651574e+31 5.934197e+30
3: 4.854285 4.817999 4.819738 18.85950 1.624393e+31 5.782999e+30
4: 4.858700 4.829641 4.853292 18.51215 1.637572e+31 5.889559e+30
5: 4.866269 4.848008 4.861886 18.47098 1.648105e+31 5.993045e+30
6: 4.847547 4.834635 4.838363 18.42452 1.639827e+31 5.894231e+30
log_volume_vwap log_volume_nvi log_volatility_bbm log_volatility_bbh
<num> <num> <num> <num>
1: 4.847742 10.45431 4.831150 4.900026
2: 4.850752 10.46660 4.834565 4.900880
3: 4.851776 10.46660 4.835673 4.900302
4: 4.852897 10.50015 4.838212 4.901398
5: 4.854164 10.50875 4.842186 4.900896
6: 4.856750 10.48522 4.844400 4.899107
log_volatility_bbl log_volatility_kcc log_volatility_kch log_volatility_kcl
<num> <num> <num> <num>
1: 4.757177 4.860520 4.888449 4.831787
2: 4.763537 4.862868 4.890976 4.833947
3: 4.766577 4.863886 4.891826 4.835143
4: 4.770763 4.862492 4.889742 4.834479
5: 4.779812 4.862679 4.890475 4.834087
6: 4.786526 4.860436 4.887786 4.832318
log_volatility_kcw log_volatility_dcl log_volatility_dch log_volatility_dcm
<num> <num> <num> <num>
1: 1.734252 4.767447 4.911668 4.842155
2: 1.740700 4.767447 4.911668 4.842155
3: 1.734627 4.767447 4.911668 4.842155
4: 1.709259 4.767447 4.911668 4.842155
5: 1.729412 4.767447 4.911668 4.842155
6: 1.712958 4.770770 4.911668 4.843698
log_volatility_dcw log_volatility_atr log_trend_sma_fast log_trend_sma_slow
<num> <num> <num> <num>
1: 2.678034 1.288087 4.854914 4.816656
2: 2.674620 1.277396 4.856945 4.821211
3: 2.673511 1.304466 4.855613 4.824243
4: 2.670972 1.337510 4.858314 4.827826
5: 2.666998 1.298429 4.860731 4.830634
6: 2.643091 1.293275 4.858897 4.832405
log_trend_ema_fast log_trend_ema_slow cube_trend_kst cube_trend_kst_sig
<num> <num> <num> <num>
1: 4.851246 4.824388 1067665.6 777981.1
2: 4.851667 4.826610 1095295.1 871437.9
3: 4.846821 4.826103 959623.9 941097.9
4: 4.847819 4.828142 805080.7 976452.1
5: 4.849996 4.830681 724592.9 984658.8
6: 4.848215 4.831252 602680.8 945661.0
log_trend_ichimoku_conv log_trend_ichimoku_base log_trend_ichimoku_a
<num> <num> <num>
1: 4.854819 4.822735 4.838906
2: 4.867362 4.826861 4.847316
3: 4.865930 4.829172 4.847720
4: 4.865930 4.841614 4.853846
5: 4.865930 4.842155 4.854113
6: 4.865930 4.842155 4.854113
log_trend_ichimoku_b log_trend_visual_ichimoku_a log_trend_visual_ichimoku_b
<num> <num> <num>
1: 4.790586 4.729501 4.715327
2: 4.790586 4.729501 4.715327
3: 4.790586 4.729501 4.715327
4: 4.790586 4.729458 4.715327
5: 4.790586 4.738023 4.715327
6: 4.790586 4.738023 4.715327
log_trend_psar_up log_trend_psar_down cube_momentum_ao log_momentum_kama
<num> <num> <num> <num>
1: 4.815739 4.911668 890.59716 4.843903
2: 4.815739 4.909932 561.48974 4.844271
3: 4.815739 4.908229 268.29266 4.843998
4: 4.815739 4.904772 140.17035 4.844071
5: 4.815739 4.901442 90.01826 4.844221
6: 4.815739 4.898235 63.38090 4.844119
# Ensure transformed_test_df_clean only contains columns present in transformed_df
def align_columns(df_source, df_target):
# Find common columns between source and target
common_columns = df_source.columns.intersection(df_target.columns)
# Align target dataframe to source dataframe's common columns
df_target_aligned = df_target[common_columns]
return df_target_aligned
# Apply the alignment to both training and test sets
transformed_test_df_clean = align_columns(transformed_df, transformed_test_df)
# Print the first few rows of the aligned test dataframe
print(transformed_test_df_clean.head())
# Check the new shapes of the aligned dataframes
print("Training set number of columns:", transformed_df.shape[1])
print("Test set number of aligned columns:", transformed_test_df_clean.shape[1])
log_Open log_Low log_volume_vwap log_volume_nvi log_volatility_bbm \ 1 4.872957 4.821001 4.847742 10.454311 4.831150 2 4.837665 4.834090 4.850752 10.466599 4.834565 3 4.828546 4.817999 4.851776 10.466599 4.835673 4 4.833544 4.829641 4.852897 10.500153 4.838212 5 4.864760 4.848008 4.854164 10.508747 4.842186 log_volatility_bbl log_volatility_kcc log_volatility_kch \ 1 4.757177 4.860520 4.888449 2 4.763537 4.862868 4.890976 3 4.766577 4.863886 4.891826 4 4.770763 4.862492 4.889742 5 4.779812 4.862679 4.890475 log_volatility_kcl log_volatility_dcl ... cube_trend_kst_sig \ 1 4.831787 4.767447 ... 777981.057474 2 4.833947 4.767447 ... 871437.934350 3 4.835143 4.767447 ... 941097.943596 4 4.834479 4.767447 ... 976452.089280 5 4.834087 4.767447 ... 984658.768397 cube_trend_kst cube_momentum_ao volume_vpt momentum_ppo \ 1 1.067666e+06 890.597161 2.537085e+08 2.722227 2 1.095295e+06 561.489737 2.549160e+08 2.537360 3 9.596239e+05 268.292663 2.496955e+08 2.093427 4 8.050807e+05 140.170349 2.534347e+08 1.987172 5 7.245929e+05 90.018258 2.543423e+08 1.950271 momentum_ppo_signal volatility_bbw trend_macd_signal trend_macd \ 1 2.724815 14.260615 3.349603 3.389452 2 2.687324 13.712819 3.312943 3.166302 3 2.568545 13.352539 3.172555 2.611005 4 2.452270 13.044938 3.034752 2.483540 5 2.351870 12.093633 2.916525 2.443617 log_Close 1 4.841691 2 4.853979 3 4.819738 4 4.853292 5 4.861886 [5 rows x 43 columns] Training set number of columns: 43 Test set number of aligned columns: 43
# This has everything as desired, all predictors and the response based on correlation filtering
print(ro.r('ncol(transformed_df)'))
# This does not yet
print(ro.r('ncol(transformed_test_df)'))
# Transform the R dataframe to reflect the changes made in Python
ro.globalenv['transformed_test_df'] = transformed_test_df_clean
print(ro.r('ncol(transformed_test_df)'))
# Problem has been resolved
[1] 43 [1] 93 [1] 43
ro.r(''' # Fit a linear model with all predictors
# Fit a linear model with all predictors
full_model <- lm(log_Close ~ ., data = transformed_df)
# Grab summary statistics
print(summary(full_model))
''')
ro.r('''
set.seed(123)
# Exponentiate the values in the test set to get them back on the real scale
y_test <- transformed_test_df$log_Close
exp_y_test <- exp(transformed_test_df$log_Close)
# Use the full model to predict on the test data
predictions_full_test <- predict(full_model, newdata = transformed_test_df)
# Exponentiate the predicted values to get them on the real scale
predictions_full_test_exp <- exp(predictions_full_test)
# Apply smearing correction to account for bias
residuals_full <- y_test - predictions_full_test # Residuals on the log scale
correction_factor <- mean(exp(residuals_full)) # Smearing correction factor
predictions_full_test_exp_smearing <- predictions_full_test_exp * correction_factor
# Calculate Full Model MSPE on the log scale (without exponentiating)
full_model_mspe_log <- mean((y_test - predictions_full_test)^2)
# Calculate Full Model MSPE on the real scale (after smearing correction)
full_model_mspe_real <- mean((exp_y_test - predictions_full_test_exp_smearing)^2)
# Print the results
cat("Full Model MSPE (Log Scale):", full_model_mspe_log, "\n")
cat("Full Model MSPE (Real Scale, Smearing Corrected):", full_model_mspe_real, "\n")
''')
Call:
lm(formula = log_Close ~ ., data = transformed_df)
Residuals:
lm(formula = log_Close ~ ., data = transformed_df)
Residuals:
Min 1Q Median 3Q Max
-0.050590 -0.003017 0.000152 0.003075 0.072946
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.434e-02 1.166e-02 2.945 0.003254 **
log_Open -5.058e-01 1.495e-02 -33.836 < 2e-16 ***
log_Low 4.762e-01 1.562e-02 30.479 < 2e-16 ***
log_volume_vwap 5.835e-02 3.488e-02 1.673 0.094502 .
log_volume_nvi 3.025e-03 1.035e-03 2.922 0.003504 **
log_volatility_bbm 2.079e+00 1.939e+00 1.072 0.283850
log_volatility_bbl 4.262e+00 4.721e+00 0.903 0.366738
log_volatility_kcc -1.979e+01 6.104e+00 -3.243 0.001199 **
log_volatility_kch 1.027e+01 3.210e+00 3.200 0.001392 **
log_volatility_kcl 9.314e+00 2.892e+00 3.220 0.001296 **
log_volatility_dcl -1.321e+00 3.563e-01 -3.709 0.000213 ***
log_volatility_dcm 2.949e+00 8.228e-01 3.584 0.000344 ***
log_volatility_dcw 1.072e-02 3.837e-03 2.794 0.005238 **
log_volatility_atr -9.894e-04 6.870e-04 -1.440 0.149911
log_trend_sma_fast 8.287e-01 6.288e-02 13.179 < 2e-16 ***
log_trend_sma_slow 3.583e-01 4.565e-02 7.848 6.02e-15 ***
log_trend_ema_fast -3.313e+00 7.044e-01 -4.704 2.68e-06 ***
log_trend_ema_slow 1.837e+00 6.695e-01 2.744 0.006103 **
log_trend_ichimoku_conv -7.355e-01 4.102e-01 -1.793 0.073096 .
log_trend_ichimoku_base -6.672e-01 4.040e-01 -1.652 0.098711 .
log_trend_ichimoku_a 1.430e+00 8.175e-01 1.749 0.080375 .
log_trend_ichimoku_b -8.325e-03 7.214e-03 -1.154 0.248617
log_trend_visual_ichimoku_a 2.329e-02 4.131e-03 5.637 1.90e-08 ***
log_trend_visual_ichimoku_b -2.241e-02 3.620e-03 -6.191 6.90e-10 ***
log_trend_psar_up 1.531e-02 5.226e-03 2.929 0.003426 **
log_trend_psar_down 9.675e-03 4.757e-03 2.034 0.042034 *
log_momentum_kama 2.559e-02 1.216e-02 2.105 0.035404 *
log_High 7.938e-01 1.985e-02 39.995 < 2e-16 ***
log_Volume -2.771e-03 4.046e-04 -6.849 9.17e-12 ***
log_volatility_bbh -5.648e+00 6.614e+00 -0.854 0.393188
log_volatility_dch -1.723e+00 4.709e-01 -3.660 0.000257 ***
log_volatility_kcw -9.417e-03 3.899e-03 -2.415 0.015789 *
cube_volume_adi 1.009e-34 7.981e-35 1.265 0.206126
cube_volume_obv 3.873e-34 1.416e-34 2.735 0.006275 **
cube_trend_kst_sig 2.519e-11 2.180e-11 1.155 0.248141
cube_trend_kst -3.276e-11 1.892e-11 -1.731 0.083589 .
cube_momentum_ao -1.580e-06 6.583e-07 -2.401 0.016429 *
volume_vpt 1.191e-11 2.933e-12 4.061 5.03e-05 ***
momentum_ppo 5.947e-02 7.660e-03 7.764 1.15e-14 ***
momentum_ppo_signal -3.251e-02 2.263e-03 -14.364 < 2e-16 ***
volatility_bbw 4.931e-02 5.629e-02 0.876 0.381084
trend_macd_signal 4.185e-04 8.362e-04 0.500 0.616773
trend_macd 1.335e-03 8.519e-04 1.567 0.117255
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.005919 on 2726 degrees of freedom
Multiple R-squared: 0.9999, Adjusted R-squared: 0.9999
F-statistic: 9.077e+05 on 42 and 2726 DF, p-value: < 2.2e-16
Full Model MSPE (Log Scale): 4.227034e-05
Full Model MSPE (Real Scale, Smearing Corrected): 0.9547482
ro.r(''' # Calculate VIF for all predictors
# VIF calculation function is in the 'car' package
library(car)
# Calculate VIF for the full model
vif_values <- vif(full_model)
# Identify variables with high VIF
high_vif <- names(vif_values[vif_values > 10])
# Identify variables with low VIF
low_vif <- names(vif_values[vif_values <= 10])
# Display results
cat("Variables with high VIF:\n")
print(high_vif)
cat("Variables with low VIF:\n")
print(low_vif)
''')
Variables with high VIF: [1] [1] "log_Open" "log_Low" [3] "log_volume_vwap" "log_volume_nvi" [5] "log_volatility_bbm" "log_volatility_bbl" [7] "log_volatility_kcc" "log_volatility_kch" [9] "log_volatility_kcl" "log_volatility_dcl" [11] "log_volatility_dcm" "log_volatility_dcw" [13] "log_volatility_atr" "log_trend_sma_fast" [15] "log_trend_sma_slow" "log_trend_ema_fast" [17] "log_trend_ema_slow" "log_trend_ichimoku_conv" [19] "log_trend_ichimoku_base" "log_trend_ichimoku_a" [21] "log_trend_ichimoku_b" "log_trend_visual_ichimoku_a" [23] "log_trend_visual_ichimoku_b" "log_trend_psar_up" [25] "log_trend_psar_down" "log_momentum_kama" [27] "log_High" "log_volatility_bbh" [29] "log_volatility_dch" "log_volatility_kcw" [31] "cube_trend_kst_sig" "cube_trend_kst" [33] "momentum_ppo" "momentum_ppo_signal" [35] "volatility_bbw" "trend_macd_signal" [37] "trend_macd" Variables with low VIF: [1] "log_Volume" "cube_volume_adi" "cube_volume_obv" "cube_momentum_ao" [5] "volume_vpt"
We will look at the model that results from VIF to see how it is.
ro.r(''' # Fit a linear model with VIF predictors
vif_model <- lm(log_Close ~ log_Volume + cube_volume_adi + cube_volume_obv + volume_vpt + cube_momentum_ao, data = transformed_df)
# Print summary statistics
print(summary(vif_model))
''')
Call:
lm(formula = log_Close ~ log_Volume + cube_volume_adi + cube_volume_obv +
volume_vpt + cube_momentum_ao, data = transformed_df)
lm(formula = log_Close ~ log_Volume + cube_volume_adi + cube_volume_obv +
volume_vpt + cube_momentum_ao, data = transformed_df)
Residuals:
Min 1Q Median 3Q Max
-0.94788 -0.15978 -0.00741 0.16428 0.77492
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.040e+01 1.460e-01 71.201 < 2e-16 ***
log_Volume -4.129e-01 7.399e-03 -55.803 < 2e-16 ***
cube_volume_adi 1.160e-31 1.861e-33 62.361 < 2e-16 ***
cube_volume_obv -1.301e-32 4.786e-33 -2.718 0.00661 **
volume_vpt -6.828e-11 9.765e-11 -0.699 0.48450
cube_momentum_ao 9.610e-05 1.474e-05 6.521 8.27e-11 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2427 on 2763 degrees of freedom
Multiple R-squared: 0.8781, Adjusted R-squared: 0.8779
F-statistic: 3981 on 5 and 2763 DF, p-value: < 2.2e-16
ro.r(''' # Get the actual test values in log scale
y_test <- transformed_test_df$log_Close
exp_y_test <- exp(transformed_test_df$log_Close)
# Predict on the test data using the VIF model
predictions_test <- predict(vif_model, newdata = transformed_test_df)
# Exponentiate the predictions to get them on the real scale
predictions_test_exp <- exp(predictions_test)
# Calculate MSPE for the log scale (no need to exponentiate)
vif_mspe <- mean((y_test - predictions_test)^2)
# Calculate MSPE for the real scale
vif_real_mspe <- mean((exp_y_test - predictions_test_exp)^2)
# Print the results
cat("VIF MSPE (Log Scale):", vif_mspe, "\n")
cat("Full Model MSPE (Log Scale):", full_model_mspe_log, "\n")
''')
VIF MSPE (Log Scale): 0.08338699 0.08338699 Full Model MSPE (Log Scale): 4.227034e-05
We evaluated two regression models for predicting financial data: the full model (with all available transformed and untransformed predictors based on correlation filtering) and a simplified VIF model (with predictors filtered using correlation, histogram analysis, and VIF elimination). Both models were trained on log-transformed data and tested on unseen data to assess their generalization abilities.
0.08344.23e-05The full model significantly outperformed the VIF model, especially on the real scale. This is likely due to the full model’s ability to capture the inherent complexities and subtle relationships present in financial data. While the VIF model was simplified to reduce multicollinearity, it likely missed important predictors, leading to higher prediction error on unseen data.
Now I will do regularized regression methods, Lasso, Ridge, and ElasticNet.
ro.r(''' # Ridge
# Load necessary libraries
library(glmnet)
library(caret)
# For reproducibility
set.seed(123)
# transformed_df uses the training data
# Convert predictors and response using model.matrix
x <- model.matrix(log_Close ~ . - 1, transformed_df) # Create design matrix, excluding intercept
y <- transformed_df$log_Close # The response variable
# Ridge Regression (alpha = 0)
ridge_model <- cv.glmnet(x, y, alpha = 0)
print("Ridge Regression:")
print(ridge_model)
# Optimal lambda for the Ridge model
best_lambda_ridge <- ridge_model$lambda.min
cat("Best lambda for Ridge:", best_lambda_ridge, "\\n")
''')
[1] "Ridge Regression:"
Call: cv.glmnet(x = x, y = y, alpha = 0) "Ridge Regression:"
Call: cv.glmnet(x = x, y = y, alpha = 0)
Measure: Mean-Squared Error
Lambda Index Measure SE Nonzero
min 0.1332 93 0.0004800 2.120e-05 42
1se 0.1462 92 0.0004955 2.134e-05 42
Best lambda for Ridge: 0.1331874
ro.r(''' # Ridge Coefficients
set.seed(123)
# Get Ridge coefficients
ridge_coefs <- coef(ridge_model, s = best_lambda_ridge)
print("Ridge Coefficients:")
print(ridge_coefs)
''')
[1] "Ridge Coefficients:"
"Ridge Coefficients:"
43 x 1 sparse Matrix of class "dgCMatrix"
s1
(Intercept) 4.637408e-01
log_Open 3.931113e-02
log_Low 3.972488e-02
log_volume_vwap 3.470217e-02
log_volume_nvi 3.052148e-02
log_volatility_bbm 3.436052e-02
log_volatility_bbl 3.458979e-02
log_volatility_kcc 3.474266e-02
log_volatility_kch 3.459766e-02
log_volatility_kcl 3.472208e-02
log_volatility_dcl 3.417655e-02
log_volatility_dcm 3.432027e-02
log_volatility_dcw 6.508523e-03
log_volatility_atr 1.703996e-02
log_trend_sma_fast 3.466929e-02
log_trend_sma_slow 3.444040e-02
log_trend_ema_fast 3.543411e-02
log_trend_ema_slow 3.475413e-02
log_trend_ichimoku_conv 3.467400e-02
log_trend_ichimoku_base 3.424567e-02
log_trend_ichimoku_a 3.432522e-02
log_trend_ichimoku_b 3.467557e-02
log_trend_visual_ichimoku_a 2.987032e-02
log_trend_visual_ichimoku_b 3.018383e-02
log_trend_psar_up 3.129366e-02
log_trend_psar_down 3.246034e-02
log_momentum_kama 3.522341e-02
log_High 3.840251e-02
log_Volume -2.206108e-02
log_volatility_bbh 3.342626e-02
log_volatility_dch 3.396499e-02
log_volatility_kcw -9.380103e-03
cube_volume_adi 2.643787e-33
cube_volume_obv 2.205903e-33
cube_trend_kst_sig 1.047330e-10
cube_trend_kst 4.059210e-11
cube_momentum_ao -2.272177e-06
volume_vpt 8.058241e-11
momentum_ppo 8.947563e-03
momentum_ppo_signal -4.405554e-04
volatility_bbw 8.738887e-05
trend_macd_signal -1.616066e-03
trend_macd 1.173095e-02
ro.r(''' # Lasso
set.seed(123)
# Lasso Regression (alpha = 1)
lasso_model <- cv.glmnet(x, y, alpha = 1)
print("Lasso Regression:")
print(lasso_model)
# Optimal lambda for the Ridge model
best_lambda_lasso <- lasso_model$lambda.min
cat("Best lambda for Lasso:", best_lambda_lasso, "\\n")
''')
[1] "Lasso Regression:"
Call: cv.glmnet(x = x, y = y, alpha = 1) "Lasso Regression:"
Call: cv.glmnet(x = x, y = y, alpha = 1)
Measure: Mean-Squared Error
Lambda Index Measure SE Nonzero
min 0.01681 41 0.0004593 1.081e-05 2
1se 0.01681 41 0.0004593 1.081e-05 2
Best lambda for Lasso: 0.01680634
ro.r(''' # Coefficients from the Lasso model
lasso_coefs <- coef(lasso_model, s = best_lambda_lasso)
print(lasso_coefs)
''')
43 x 1 sparse Matrix of class "dgCMatrix"
s1 s1
(Intercept) 0.07550758
log_Open 0.95551481
log_Low 0.02099481
log_volume_vwap .
log_volume_nvi .
log_volatility_bbm .
log_volatility_bbl .
log_volatility_kcc .
log_volatility_kch .
log_volatility_kcl .
log_volatility_dcl .
log_volatility_dcm .
log_volatility_dcw .
log_volatility_atr .
log_trend_sma_fast .
log_trend_sma_slow .
log_trend_ema_fast .
log_trend_ema_slow .
log_trend_ichimoku_conv .
log_trend_ichimoku_base .
log_trend_ichimoku_a .
log_trend_ichimoku_b .
log_trend_visual_ichimoku_a .
log_trend_visual_ichimoku_b .
log_trend_psar_up .
log_trend_psar_down .
log_momentum_kama .
log_High .
log_Volume .
log_volatility_bbh .
log_volatility_dch .
log_volatility_kcw .
cube_volume_adi .
cube_volume_obv .
cube_trend_kst_sig .
cube_trend_kst .
cube_momentum_ao .
volume_vpt .
momentum_ppo .
momentum_ppo_signal .
volatility_bbw .
trend_macd_signal .
trend_macd .
ro.r(''' # ElasticNet
# Set seed for reproducibility
set.seed(123)
x <- model.matrix(log_Close ~ . - 1, transformed_df) # Create design matrix, excluding intercept
y <- transformed_df$log_Close # The response variable
# Define the alpha values to loop over (e.g., from 0 to 1 in steps of 0.1)
alpha_values <- seq(0, 1, by = 0.1)
# Initialize variables to store the best results
best_alpha <- NULL
best_lambda <- NULL
lowest_mse <- Inf # Set initial MSE to infinity
# Loop through each alpha value
for (alpha_value in alpha_values) {
# Fit ElasticNet model for each alpha
elasticnet_model <- cv.glmnet(x, y, alpha = alpha_value)
# Get the best lambda for this alpha
best_lambda_for_alpha <- elasticnet_model$lambda.min
# Predict using the best lambda
predictions <- predict(elasticnet_model, s = best_lambda_for_alpha, newx = x)
# Calculate MSE
mse <- mean((y - predictions)^2)
# Output the appropriate message for each model type
if (alpha_value == 0){
cat("Ridge Regression: MSE:", mse, " | Best Lambda:", best_lambda_for_alpha, "\n")
} else if (alpha_value == 1){
cat("Lasso Regression: MSE:", mse, " | Best Lambda:", best_lambda_for_alpha, "\n")
} else {
cat("Alpha:", alpha_value, " | MSE:", mse, " | Best Lambda:", best_lambda_for_alpha, "\n")
}
# Update the best alpha and lambda if this model has the lowest MSE
if (mse < lowest_mse) {
best_alpha <- alpha_value
best_lambda <- best_lambda_for_alpha
lowest_mse <- mse
}
}
# Print the best alpha, lambda, and corresponding MSE
cat("\nBest Alpha:", best_alpha, "\n")
cat("Best Lambda:", best_lambda, "\n")
cat("Lowest MSE:", lowest_mse, "\n")
''')
Ridge Regression: MSE: 0.0004706209 | Best Lambda: 0.0004706209 | Best Lambda: 0.1331874 Alpha: 0.1 | MSE: 0.0004698214 | Best Lambda: 0.04568951 Alpha: 0.2 | MSE: 0.0004800163 | Best Lambda: 0.03019942 Alpha: 0.3 | MSE: 0.00046378 | Best Lambda: 0.02209589 Alpha: 0.4 | MSE: 0.0004778316 | Best Lambda: 0.01996093 Alpha: 0.5 | MSE: 0.0004690193 | Best Lambda: 0.01752568 Alpha: 0.6 | MSE: 0.0004536481 | Best Lambda: 0.01602867 Alpha: 0.7 | MSE: 0.0004772724 | Best Lambda: 0.01654851 Alpha: 0.8 | MSE: 0.0004411106 | Best Lambda: 0.01589172 Alpha: 0.9 | MSE: 0.0004464977 | Best Lambda: 0.01701479 Lasso Regression: MSE: 0.0004580127 | Best Lambda: 0.01680634 Best Alpha: 0.8 Best Lambda: 0.01589172 Lowest MSE: 0.0004411106
ro.r(''' # Coefficients from the ElasticNet model
elasticnet_coefs <- coef(elasticnet_model, s = best_lambda_enet)
print(elasticnet_coefs)
''')
43 x 1 sparse Matrix of class "dgCMatrix"
s1 s1
(Intercept) 0.07550758
log_Open 0.95551481
log_Low 0.02099481
log_volume_vwap .
log_volume_nvi .
log_volatility_bbm .
log_volatility_bbl .
log_volatility_kcc .
log_volatility_kch .
log_volatility_kcl .
log_volatility_dcl .
log_volatility_dcm .
log_volatility_dcw .
log_volatility_atr .
log_trend_sma_fast .
log_trend_sma_slow .
log_trend_ema_fast .
log_trend_ema_slow .
log_trend_ichimoku_conv .
log_trend_ichimoku_base .
log_trend_ichimoku_a .
log_trend_ichimoku_b .
log_trend_visual_ichimoku_a .
log_trend_visual_ichimoku_b .
log_trend_psar_up .
log_trend_psar_down .
log_momentum_kama .
log_High .
log_Volume .
log_volatility_bbh .
log_volatility_dch .
log_volatility_kcw .
cube_volume_adi .
cube_volume_obv .
cube_trend_kst_sig .
cube_trend_kst .
cube_momentum_ao .
volume_vpt .
momentum_ppo .
momentum_ppo_signal .
volatility_bbw .
trend_macd_signal .
trend_macd .
ro.r(''' # MSPE
set.seed(123)
# Check and align test data to match training data columns
transformed_test_df <- transformed_test_df[, colnames(transformed_df), drop = FALSE]
# Compute MSPE on the test set
test_x <- model.matrix(log_Close ~ . - 1, transformed_test_df) # Prepare test data
test_y <- transformed_test_df$log_Close # True values from the test set
# Use best lambda and model (ElasticNet, Ridge, or Lasso)
elasticnet_predictions <- predict(elasticnet_model, s = best_lambda, newx = test_x)
ridge_predictions <- predict(ridge_model, s = best_lambda_ridge, newx = test_x)
lasso_predictions <- predict(lasso_model, s = best_lambda_lasso, newx = test_x)
vif_predictions <- predict(vif_model, newdata = transformed_test_df) # VIF model uses full test data
# Calculate MSPE for each method
elasticnet_mspe <- mean((test_y - elasticnet_predictions)^2)
ridge_mspe <- mean((test_y - ridge_predictions)^2)
lasso_mspe <- mean((test_y - lasso_predictions)^2)
vif_mspe <- mean((test_y - vif_predictions)^2)
# Output the results
cat("ElasticNet MSPE (Log Scale):", elasticnet_mspe, "\n")
cat("Ridge MSPE (Log Scale):", ridge_mspe, "\n")
cat("Lasso MSPE (Log Scale):", lasso_mspe, "\n")
cat("VIF MSPE (Log Scale):", vif_mspe, "\n")
cat("Full Model MSPE (Log Scale):", full_model_mspe_log, "\n")
library(MASS)
# Fit robust regression model
robust_model <- rlm(log_Close ~ . , data = transformed_df)
# Predict on the test data
robust_preds <- predict(robust_model, newdata = transformed_test_df)
robust_mspe <- mean((test_y - robust_preds)^2)
cat("Huber Regression MSPE (Log Scale):", robust_mspe, "\n")
# Fit the LTS regression model
lts_model <- lqs(log_Close ~ ., data = transformed_df)
lts_preds <- predict(lts_model, newdata = transformed_test_df)
lts_mspe <- mean((test_y - lts_preds)^2)
cat("Least Trimmed Squares Regression MSPE (Log Scale):", lts_mspe, "\n")
# Fit the quantile regression model at the median (tau = 0.5)
library(quantreg)
qr_model <- rq(log_Close ~ . , data = transformed_df, tau = 0.5)
qr_preds <- predict(qr_model, newdata = transformed_test_df)
qr_mspe <- mean((test_y - qr_preds)^2)
cat("Quantile Regression MSPE (Log Scale):", qr_mspe, "\n")
# Fit penalized quantile regression
library(rqPen)
pen_qr_model <- rq.fit.lasso(x = test_x, y = test_y, tau = 0.5)
pen_qr_preds <- test_x %*% pen_qr_model$coefficients
pen_qr_mspe <- mean((test_y - pen_qr_preds)^2)
cat("Penalized Quantile Regression MSPE (Log Scale):", pen_qr_mspe, "\n")
''')
ElasticNet MSPE (Log Scale): 0.002135133 0.002135133 Ridge MSPE (Log Scale): 0.000694228 Lasso MSPE (Log Scale): 0.002135133 VIF MSPE (Log Scale): 0.08338699 Full Model MSPE (Log Scale): 4.227034e-05 Huber Regression MSPE (Log Scale): 3.789848e-05 Least Trimmed Squares Regression MSPE (Log Scale): 0.0005061762 Quantile Regression MSPE (Log Scale): 3.602424e-05 Penalized Quantile Regression MSPE (Log Scale): 7.939873e-05
ro.r(''' # MSPE Real Scale using bias correction
set.seed(123)
# Check and align test data to match training data columns
transformed_test_df <- transformed_test_df[, colnames(transformed_df), drop = FALSE]
# Compute MSPE on the test set
test_x <- model.matrix(log_Close ~ . - 1, transformed_test_df) # Prepare test data
test_y <- transformed_test_df$log_Close # True values from the test set
# Smearing function to apply bias correction
smearing_bias_correction <- function(log_preds, log_true) {
residuals_log <- log_true - log_preds
correction_factor <- mean(exp(residuals_log)) # Smearing correction factor
return(correction_factor)
}
# Function to calculate MSPE on the real scale
calculate_mspe_original <- function(log_preds, log_true) {
pred_original <- exp(log_preds)
true_original <- exp(log_true)
# Apply smearing bias correction
correction_factor <- smearing_bias_correction(log_preds, log_true)
adjusted_preds <- pred_original * correction_factor
# Calculate MSPE on the original scale
mspe_original <- mean((true_original - adjusted_preds)^2)
return(mspe_original)
}
# ElasticNet predictions and MSPE
elasticnet_predictions <- predict(elasticnet_model, s = best_lambda, newx = test_x)
elasticnet_mspe_real <- calculate_mspe_original(elasticnet_predictions, test_y)
# Ridge predictions and MSPE
ridge_predictions <- predict(ridge_model, s = best_lambda_ridge, newx = test_x)
ridge_mspe_real <- calculate_mspe_original(ridge_predictions, test_y)
# Lasso predictions and MSPE
lasso_predictions <- predict(lasso_model, s = best_lambda_lasso, newx = test_x)
lasso_mspe_real <- calculate_mspe_original(lasso_predictions, test_y)
# VIF Model predictions and MSPE
vif_predictions <- predict(vif_model, newdata = transformed_test_df) # VIF model uses full test data
vif_mspe_real <- calculate_mspe_original(vif_predictions, test_y)
# Full Model MSPE (Real Scale)
full_model_mspe_real <- calculate_mspe_original(predictions_full_test, test_y)
# Robust Model MSPE
robust_preds <- predict(robust_model, newdata = transformed_test_df)
robust_mspe_real <- calculate_mspe_original(robust_preds, test_y)
# LTS Model MSPE
lts_preds <- predict(lts_model, newdata = transformed_test_df)
lts_mspe_real <- calculate_mspe_original(lts_preds, test_y)
# Quantile Regression Model MSPE
qr_preds <- predict(qr_model, newdata = transformed_test_df)
qr_mspe_real <- calculate_mspe_original(qr_preds, test_y)
# Penalized Quantile Regression Model MSPE
pen_qr_preds <- test_x %*% pen_qr_model$coefficients
pen_qr_mspe_real <- calculate_mspe_original(pen_qr_preds, test_y)
''')
ro.r(''' # Output the results (Real Scale)
cat("ElasticNet MSPE (Real Scale):", elasticnet_mspe_real, "\n")
cat("Ridge MSPE (Real Scale):", ridge_mspe_real, "\n")
cat("Lasso MSPE (Real Scale):", lasso_mspe_real, "\n")
cat("VIF MSPE (Real Scale):", vif_mspe_real, "\n")
cat("Full Model MSPE (Real Scale):", full_model_mspe_real, "\n")
cat("Huber Regression MSPE (Real Scale):", robust_mspe_real, "\n")
cat("Least Trimmed Squares MSPE (Real Scale):", lts_mspe_real, "\n")
cat("Quantile Regression MSPE (Real Scale):", qr_mspe_real, "\n")
cat("Penalized Quantile Regression MSPE (Real Scale):", pen_qr_mspe_real, "\n")
''')
ElasticNet MSPE (Real Scale): 4.930664 4.930664 Ridge MSPE (Real Scale): 16.37224 Lasso MSPE (Real Scale): 4.930664 VIF MSPE (Real Scale): 3076.543 Full Model MSPE (Real Scale): 0.9547482 Huber Regression MSPE (Real Scale): 0.85243 Least Trimmed Squares MSPE (Real Scale): 11.6136 Quantile Regression MSPE (Real Scale): 0.8047459 Penalized Quantile Regression MSPE (Real Scale): 1.758257
In my analysis, I applied robust regression methods—Quantile Regression (QR), Least Trimmed Squares (LTS), and Huber's method—and regularized regression methods—Lasso, Ridge, and Elastic Net regression models. My goal was to address outliers and multicollinearity, evaluating the predictive performance of these regression models.
| Model | MSPE (Log Scale) | MSPE (Original Scale) |
|---|---|---|
| Quantile Regression | 3.6021e-05 | 0.8046161 |
| Huber Regression | 3.789034e-05 | 0.8522115 |
| Penalized Quantile Regression | 7.939872e-05 | 1.758258 |
| Least Trimmed Squares Regression | 0.0007393806 | 15.15739 |
| Model | MSPE (Log Scale) | MSPE (Original Scale) |
|---|---|---|
| Ridge Regression | 0.0006943541 | 16.37199 |
| Lasso Regression | 0.002135133 | 4.930664 |
| Elastic Net Regression | 0.002135133 | 4.930664 |
| VIF Model | 0.08338693 | 3076.539 |
| Full Model | 4.225889e-05 | 0.9543641 |
Quantile Regression (QR) had the lowest MSPE on both the log and original scales, with 3.6021e-05 (Log) and 0.8046161 (Original), making it the top performer.
Huber Regression followed closely, with an MSPE of 3.789034e-05 (Log) and 0.8522115 (Original), indicating its robustness against outliers.
Penalized Quantile Regression showed a higher MSPE than QR and Huber, with 7.939872e-05 (Log) and 1.758258 (Original).
Least Trimmed Squares (LTS) Regression performed worst among robust methods, with 0.0007393806 (Log) and 15.15739 (Original).
Ridge Regression had a relatively low MSPE on the log scale (0.0006943541) but was less effective on the original scale (16.37199).
Lasso and Elastic Net Regression showed similar performance, with 0.002135133 (Log) and 4.930664 (Original).
The VIF Model performed the worst, with an MSPE of 0.08338693 (Log) and 3076.539 (Original), indicating suboptimal predictions.
The Full Model was the strongest regularized method, with 4.225889e-05 (Log) and 0.9543641 (Original).
Quantile Regression (QR) is effective at estimating the median relationship between the predictors and the response. It minimizes the influence of extreme values by focusing on the median rather than the mean, which explains its superior performance on both scales.
Huber Regression combines squared error for small residuals and absolute error for large residuals. This approach makes it resistant to outliers while preserving the efficiency of least squares for typical data points.
Regularized Regression (Ridge, Lasso, and Elastic Net) applies penalties to the regression coefficients. Ridge regression uses the L2 penalty to shrink coefficients of correlated predictors, while Lasso applies an L1 penalty to enforce sparsity. Elastic Net is a combination of both L1 and L2 penalties, balancing shrinkage and variable selection.
The VIF Model was built by removing predictors with high multicollinearity based on the variance inflation factor. However, this simplistic approach likely resulted in the removal of useful predictors, leading to poor predictive performance.
Financial data often contains extremes and high volatility, which can distort traditional models. Robust methods like Quantile and Huber regression are less influenced by outliers, making them well-suited to handle the irregularities in financial data. This is why these models performed better than regularized methods, which may be more sensitive to such fluctuations.
Following these analyses, the next logical step is to incorporate time-dependent patterns through Time Series Analysis (TSA). By applying ARIMA and GARCH models, we aim to capture autocorrelation and volatility clustering, respectively. This transition will allow us to account for the temporal structure in stock prices and improve model performance further.
While the regression models provided valuable insights, they may not fully capture temporal dependencies inherent in time series data. Time Series Analysis techniques are specifically designed to model and forecast data where observations are correlated over time.
Next Steps:
Exploratory Time Series Analysis:
Stationarity Assessment:
Modeling Temporal Dependencies:
Forecast Evaluation:
By incorporating Time Series Analysis techniques, I will better understand the dynamics of the data over time, leading to more accurate and robust forecasts.
Given the inherent autocorrelation in stock market data, the next step is to conduct Time Series Analysis (TSA) to capture the temporal patterns. By applying ARIMA and potentially integrating GARCH models, we aim to model both the time-dependent trends and volatility in the data. This should help alleviate the burden on other predictors, allowing them to better capture the underlying relationships.
Once TSA is incorporated, we will re-evaluate the performance of robust and regularized regression methods, now accounting for time-series noise. Following TSA, we plan to explore Principal Component Analysis (PCA) and tree-based models (e.g., Random Forests) to further address multicollinearity, improve feature selection, and enhance prediction accuracy.
This approach ensures a comprehensive model that leverages both temporal dynamics and feature-based insights for more robust predictive performance.
# Create a function to save R plots to a PNG file and display them in Python
from IPython.display import Image, display
def display_r_plot(filename):
display(Image(filename))
ro.r('''
# Convert the 'Close' column to a time series object
close_ts <- ts(aapl_r_df$Close, start = c(2009, 1), frequency = 252) # Assuming daily data with ~252 trading days per year
# Transform the response variable as we did previously from Exploratory Analysis
close_ts <- log(close_ts)
# Verify we have created a time series object
print(class(close_ts))
# Create a new column in the dataframe to hold log_close
aapl_r_df$log_Close <- log(aapl_r_df$Close)
# Check the basic structure of the time series data
print(summary(close_ts))
plot(close_ts, main = "Close Price Time Series", ylab = "Price", xlab = "Time")
''')
ro.r('''
# Decompose the time series into trend, seasonal, and residual components
decomposed_close <- decompose(close_ts)
# Plot the decomposition
png(filename = "decomposition_plot.png", width = 800, height = 600)
plot(decomposed_close)
dev.off()
''')
# Display the decomposition plot
display_r_plot("decomposition_plot.png")
ro.r('''
# Plot the 'Close' price
png(filename = "Close_price.png", width = 600, height = 600)
# Plot the close prices directly without storing it in a variable
plot(aapl_r_df$Date, aapl_r_df$log_Close, type = "l", col = "blue",
main = "Log Close Price Over Time", xlab = "Date", ylab = "Close Price")
# Close the PNG device
dev.off()
# Check for stationarity using the Augmented Dickey-Fuller (ADF) test
adf_test <- adf.test(aapl_r_df$log_Close)
print(adf_test)
''')
# Display the plot with the corresponding filename
display_r_plot("Close_price.png")
ro.r('''
# Apply differencing to make the series stationary
differenced_close <- diff(close_ts)
# Plot the differenced series
png(filename = "differenced_close.png", width = 800, height = 600)
plot(differenced_close, main="Differenced Log Close Price Time Series", ylab="Differenced Price", xlab="Time")
dev.off()
''')
# Display the plot
display_r_plot("differenced_close.png")
# Perform the ADF test on the differenced series
ro.r('''
adf_test_diff <- adf.test(differenced_close)
print(adf_test_diff)
''')
# ACF plot example
ro.r('''
# Open PNG device with a custom filename for ACF
png(filename = "acf_plot.png", width = 800, height = 600)
# Create ACF plot
acf(aapl_r_df$Close, main="ACF of Log Close Prices")
# Close the PNG device
dev.off()
''')
# Display the ACF plot
display_r_plot("acf_plot.png")
# PACF plot example
ro.r('''
# Open PNG device with a custom filename for PACF
png(filename = "pacf_plot.png", width = 800, height = 600)
# Create PACF plot
pacf(aapl_r_df$Close, main="PACF of Log Close Prices")
# Close the PNG device
dev.off()
''')
# Display the PACF plot
display_r_plot("pacf_plot.png")
[1] "ts" Min. 1st Qu. Median Mean 3rd Qu. Max. 1.757 2.832 3.370 3.591 4.488 5.285
Augmented Dickey-Fuller Test data: aapl_r_df$log_Close Dickey-Fuller = -2.5797, Lag order = 15, p-value = 0.3329 alternative hypothesis: stationary
Augmented Dickey-Fuller Test data: differenced_close Dickey-Fuller = -14.365, Lag order = 15, p-value = 0.01 alternative hypothesis: stationary
R[write to console]: In addition: R[write to console]: Warning message: R[write to console]: In adf.test(differenced_close) : R[write to console]: p-value smaller than printed p-value
Our ACF is slowly decreasing, while the PACF cuts off at lag 2, indicating the presence of an autoregressive (AR) process. We have also differenced the data once to achieve stationarity, which suggests an integration order of 1. Based on the structure of the ACF, where lag 1 appears almost 1, an additional moving average (MA) component might also be necessary to capture the full dynamics of the data.
ro.r(''' # ARIMA testing
# Set up ranges for AR and MA components
p_values <- c(1, 2) # Possible values for AR (from PACF analysis)
q_values <- c(0, 1, 2, 3) # Possible values for MA (from ACF analysis)
d <- 1 # The differencing order has already been determined
# Initialize variables to store the best model and criteria
best_aic <- Inf
best_bic <- Inf
best_model_aic <- NULL
best_model_bic <- NULL
# Loop over p and q values
for (p in p_values) {
for (q in q_values) {
# Try fitting the ARIMA model with current p, d, q
arima_model <- tryCatch({arima(differenced_close, order = c(p, d, q))}, error = function(e) NULL)
# Check if the model fitting was successful
if (!is.null(arima_model)) {
# Calculate AIC and BIC for the model
current_aic <- AIC(arima_model)
current_bic <- BIC(arima_model)
# Update the best model based on AIC
if (current_aic < best_aic) {
best_aic <- current_aic
best_model_aic <- arima_model
}
# Update the best model based on BIC
if (current_bic < best_bic) {
best_bic <- current_bic
best_model_bic <- arima_model
}
# Print the result for this model
cat("ARIMA(", p, ",", d, ",", q, "): AIC =", current_aic, ", BIC =", current_bic, "\n")
}
}
}
# Print the best models
cat("\nBest model based on AIC:\n")
print(best_model_aic)
cat("\nBest model based on BIC:\n")
print(best_model_bic)
''')
ARIMA( 1 , 1 , 1 , 0 ): AIC = -16903.05 , BIC = -16890.71
ARIMA( 1 , 1 , 1 ): AIC = -18368.09 , BIC = -18349.59
ARIMA( 1 , 1 , 2 ): AIC = -18366.18 , BIC = -18341.51
ARIMA( 1 , 1 , 3 ): AIC = -18367.07 , BIC = -18336.24
ARIMA( 2 , 1 , 0 ): AIC = -17295.14 , BIC = -17276.64
ARIMA( 2 , 1 , 1 ): AIC = -18366.1 , BIC = -18341.43
ARIMA( 2 , 1 , 2 ): AIC = -18364.15 , BIC = -18333.32
ARIMA( 2 , 1 , 3 ): AIC = -18363.01 , BIC = -18326.02
Best model based on AIC:
Call:
arima(x = differenced_close, order = c(p, d, q))
Coefficients:
ar1 ma1
-0.0431 -1.0000
s.e. 0.0168 0.0012
sigma^2 estimated as 0.0003154: log likelihood = 9187.05, aic = -18368.09
Best model based on BIC:
Call:
arima(x = differenced_close, order = c(p, d, q))
Coefficients:
ar1 ma1
-0.0431 -1.0000
s.e. 0.0168 0.0012
sigma^2 estimated as 0.0003154: log likelihood = 9187.05, aic = -18368.09
I conducted time series analysis (TSA) on the AAPL Close prices, focusing on selecting the best-fitting ARIMA model. My main goal was to model the temporal dependencies in the differenced and log-transformed series and identify the optimal model based on AIC and BIC criteria.
Log Transformation:
Differencing:
Based on the ACF and PACF plots, I tested several ARIMA(p, d, q) models with different AR (p) and MA (q) components. The differencing (d) was fixed at 1.
| ARIMA Model (p, d, q) | AIC | BIC |
|---|---|---|
| ARIMA(1, 1, 0) | -16903.04 | -16890.71 |
| ARIMA(1, 1, 1) | -18368.09 | -18349.59 |
| ARIMA(1, 1, 2) | -18366.17 | -18341.51 |
| ARIMA(1, 1, 3) | -18367.07 | -18336.24 |
| ARIMA(2, 1, 0) | -17295.14 | -17276.64 |
| ARIMA(2, 1, 1) | -18366.09 | -18341.43 |
| ARIMA(2, 1, 2) | -18364.15 | -18333.32 |
| ARIMA(2, 1, 3) | -18363.13 | -18326.14 |
This analysis provides a thorough statistical basis for the model selection, ensuring I choose the most appropriate time-series model for predictive purposes.
ro.r(''' # Extract residuals from the best ARIMA model
residuals_arima <- residuals(best_model_aic)
# Plot residuals
png(filename = "residuals_plot.png", width = 800, height = 600)
plot(residuals_arima, main="Residuals from ARIMA(1, 1, 1)", ylab="Residuals", xlab="Time")
dev.off()
''')
display_r_plot("residuals_plot.png")
ro.r('''
# Check for autocorrelation in the residuals using ACF
png(filename = "residuals_acf.png", width = 800, height = 600)
acf(residuals_arima, main="ACF of Residuals from ARIMA(1, 1, 1)")
dev.off()
''')
display_r_plot("residuals_acf.png")
ro.r(''' # Perform Ljung-Box test to check for any remaining autocorrelation
ljung_box_test <- Box.test(residuals_arima, lag=10, type="Ljung-Box")
print(ljung_box_test)
''')
Box-Ljung test Box-Ljung test data: residuals_arima X-squared = 44.529, df = 10, p-value = 2.645e-06
ro.r(''' # ARIMA residuals
arima_211 <- arima(differenced_close, order =c(2,1,1))
residuals_arima_211 <- residuals(arima_211)
# Plot residuals from ARIMA(2, 1, 1)
png(filename = "residuals_arima_211.png", width = 800, height = 600)
plot(residuals_arima_211, main="Residuals from ARIMA(2, 1, 1)", ylab="Residuals", xlab="Time")
dev.off()
''')
display_r_plot("residuals_arima_211.png")
# Check the ACF of the residuals
ro.r('''
png(filename = "acf_residuals_arima_211.png", width = 800, height = 600)
acf(residuals_arima_211, main="ACF of Residuals from ARIMA(2, 1, 1)")
dev.off()
''')
display_r_plot("acf_residuals_arima_211.png")
ro.r(''' # Perform the Ljung-Box test to check for remaining autocorrelation
ljung_box_test_211 <- Box.test(residuals_arima_211, lag=10, type="Ljung-Box")
print(ljung_box_test_211)
''')
Box-Ljung test Box-Ljung test data: residuals_arima_211 X-squared = 44.46, df = 10, p-value = 2.721e-06
ro.r(''' # ARIMA summary
# Print ARIMA(1,1,1) summary
print(summary(best_model_aic))
# Print ARIMA(2,1,1) summary to see if change solved issues
print(summary(arima_211))
''')
Call:
arima(x = differenced_close, order = c(p, d, q))
arima(x = differenced_close, order = c(p, d, q))
Coefficients:
ar1 ma1
-0.0431 -1.0000
s.e. 0.0168 0.0012
sigma^2 estimated as 0.0003154: log likelihood = 9187.05, aic = -18368.09
Training set error measures:
ME RMSE MAE MPE MAPE MASE
Training set -5.812154e-05 0.01775744 0.01253686 NaN Inf 0.6925595
ACF1
Training set -0.0001845959
Call:
arima(x = differenced_close, order = c(2, 1, 1))
Coefficients:
ar1 ar2 ma1
-0.0430 0.0012 -1.0000
s.e. 0.0169 0.0169 0.0012
sigma^2 estimated as 0.0003154: log likelihood = 9187.05, aic = -18366.1
Training set error measures:
ME RMSE MAE MPE MAPE MASE
Training set -5.801953e-05 0.01775743 0.01253679 -Inf Inf 0.6925553
ACF1
Training set -0.0002058323
In the initial stages of Time Series Analysis, I explored various ARIMA models to identify the best fit for the data using AIC and BIC criteria. Both criteria converged on the same ARIMA model: ARIMA(1, 1, 1). Since AIC and BIC agreed on the best model, this suggests that further exploration of ARIMA models (e.g., adding more AR or MA terms) would likely lead to overfitting without significant improvement in model accuracy.
Upon analyzing the residuals from the selected ARIMA(1, 1, 1) model, it became clear that some autocorrelation remains, indicating that the model did not fully capture all the dependencies in the data. Despite this, adding additional AR or MA terms would not necessarily address these issues, as the model selection process already chose the best possible model based on statistical criteria.
Given that the series has already been differenced to achieve stationarity, there is no need for further differencing.
ARIMAX (AutoRegressive Integrated Moving Average with Exogenous Variables) extends ARIMA by incorporating external predictors. It is typically used when external factors significantly influence the target variable beyond autoregressive and moving average components.
Purpose: Introduced ARIMAX to assess whether adding external variables (filtered via VIF to avoid collinearity issues with computations) could enhance the model’s performance by addressing volatility or autocorrelation in the differenced close data. Given that ARIMA(1,1,1) residuals displayed volatility spikes, the aim is to see if exogenous variables could help.
Thus, I will proceed by exploring more advanced models to capture the remaining patterns in the data, rather than trying additional ARIMA variations.
ro.r(''' # Ensure differenced_close_ts and x_arimax cover the same time period
# Slice differenced_close_ts to match training data (assuming it's time-series)
differenced_close_train <- differenced_close[1:nrow(transformed_df)] # Adjust to fit training set size
# Create the x_arimax matrix using only the predictors filtered from VIF analysis (without intercept)
# Variables left after VIF filtering were: log_Volume, cube_volume_adi, cube_volume_obv, and volume_vpt
# Calculate correlation matrix for the selected variables
cor_matrix <- cor(transformed_df[, c("log_Volume", "cube_volume_obv", "volume_vpt", "cube_volume_adi", 'cube_momentum_ao')])
# Print the correlation matrix
print(cor_matrix)
# Given the correlation matrix, I tried log_Volume + cube_momentum_ao, and compared it to just log_Volume and cube_momentum_ao did not help, so I will only use ARIMAX with log_Volume
x_arimax <- model.matrix(log_Close ~ log_Volume - 1, transformed_df)
# Fit the ARIMAX model with the selected predictors (exclude intercept)
arimax_model <- arima(differenced_close_train, order = c(1, 1, 1), xreg = x_arimax)
# Summarize the ARIMAX model
print(summary(arimax_model))
# Extract residuals from the ARIMAX model
residuals_arimax <- residuals(arimax_model)
# Plot residuals from ARIMAX model
png(filename = "residuals_arimax_regularized.png", width = 800, height = 600)
plot(residuals_arimax, main="Residuals from ARIMAX(1, 1, 1)", ylab="Residuals", xlab="Time")
dev.off()
''')
# Display the residuals plot
display_r_plot("residuals_arimax_regularized.png")
ro.r('''
# Check the ACF of the residuals to check for any autocorrelation
png(filename = "acf_residuals_arimax_regularized.png", width = 800, height = 600)
acf(residuals_arimax, main="ACF of Residuals from ARIMAX(1, 1, 1)")
dev.off()
''')
# Display the ACF plot
display_r_plot("acf_residuals_arimax_regularized.png")
log_Volume cube_volume_obv volume_vpt cube_volume_adi
log_Volume 1.00000000 -0.1633523 -0.1980597 -0.4483261
cube_volume_obv -0.16335225 1.0000000 0.7946242 0.7010019
volume_vpt -0.19805974 0.7946242 1.0000000 0.6361410
cube_volume_adi -0.44832610 0.7010019 0.6361410 1.0000000
cube_momentum_ao -0.05157043 0.1718374 0.1443486 0.2397274
cube_momentum_ao
log_Volume -0.05157043
cube_volume_obv 0.17183744
volume_vpt 0.14434862
cube_volume_adi 0.23972739
cube_momentum_ao 1.00000000
Call:
arima(x = differenced_close_train, order = c(1, 1, 1), xreg = x_arimax)
Coefficients:
ar1 ma1 log_Volume
-0.0511 -1.0000 -2e-04
s.e. 0.0190 0.0018 5e-04
sigma^2 estimated as 0.0003178: log likelihood = 7215.24, aic = -14422.49
Training set error measures:
ME RMSE MAE MPE MAPE MASE
Training set -4.237063e-05 0.01782374 0.0123838 -Inf Inf 0.6904245
ACF1
Training set -0.0004716487
ro.r(''' # Perform Ljung-Box test to check for autocorrelation in residuals
ljung_box_test_arimax <- Box.test(residuals_arimax, lag = 10, type = "Ljung-Box")
print(ljung_box_test_arimax)
''')
Box-Ljung test Box-Ljung test data: residuals_arimax X-squared = 48.151, df = 10, p-value = 5.826e-07
After establishing that the ARIMA(1,1,1) model adequately handled stationarity by ensuring a zero-mean and constant variance, I introduced the ARIMAX model to assess whether adding external variables (specifically VIF-filtered variables) could improve the model’s performance. However, the ARIMAX model did not lead to meaningful improvements for two primary reasons:
Singularity Problems: When fitting ARIMAX with multiple predictors or collinear variables, the model encountered singularity issues. Singularity occurs when predictors are highly collinear, leading to unreliable coefficient estimates. In this case, the external variables added little new information beyond what the ARIMA model already captured.
Volatility Clustering: The ARIMAX residuals still showed signs of autocorrelation, indicating unresolved volatility clustering. ARIMAX models are designed to capture mean shifts due to external variables, but they are not equipped to address time-varying volatility, which was evident in the data.
The ARIMAX model did not improve performance because the core issue lies in volatility clustering rather than mean shifts. Since ARIMAX is intended to account for changes in the mean from external shocks, it did not address the volatility in the residuals. Thus, there is no justification for testing additional exogenous variables at this stage.
Instead, I will shift focus to more advanced models designed for volatility, specifically GARCH and EGARCH, which are better suited for handling time-varying volatility. These models should resolve the remaining issues observed in the residuals.
Volatility Clustering: A Box-Ljung Test returned a p-value of 6.213e-07, confirming that autocorrelation is still present in the residuals. This indicates that the primary issue is volatility, not mean shifts, making it inefficient to continue testing more external variables.
Singularity: The singularity problems further support that the added predictors do not provide enough new information to justify additional testing with ARIMAX.
Note: I will revisit ARIMAX using the VIX as an external variable, but for now, the focus remains on volatility modeling using GARCH and EGARCH.
Given the presence of volatility clustering, I will now transition to GARCH and EGARCH models. These models are designed to capture time-varying volatility, which ARIMA and ARIMAX failed to address. Financial data often exhibit clusters of volatility that cannot be explained by shifts in the mean alone, making these models particularly suitable for this analysis.
The GARCH(1,1) model captures conditional volatility, while the EGARCH(1,1) model accounts for asymmetric effects, where negative shocks have a larger impact on volatility. These features are essential for modeling financial market behaviors where large price swings tend to cluster.
By testing these models and their variations (with Normal and Student-t distributions), I aim to identify the most effective model for capturing both mean and volatility dynamics in the dataset.
ro.r(''' # GARCH modeling
# Step 1: Define GARCH(1,1) with no mean model specification
garch_spec <- ugarchspec(
variance.model = list(model = "sGARCH", garchOrder = c(1, 1)),
mean.model = list(armaOrder = c(0, 0), include.mean = FALSE), # No ARMA mean model since ARIMAX handled it
distribution.model = "norm"
)
# Step 2: Fit the GARCH(1,1) model using the differenced log close prices (training data)
garch_fit <- ugarchfit(spec = garch_spec, data = differenced_close_train)
# Step 3: Print the summary of the GARCH(1,1) model
print(garch_fit)
# Step 4: Extract residuals for diagnostics
garch_residuals <- residuals(garch_fit, standardize = TRUE)
# Convert residuals to a time series object
garch_residuals_ts <- ts(garch_residuals, start = c(2009, 1), frequency = 252) # Adjust the start date as needed
# Step 5: Plot GARCH residuals with the correct time index
png(filename = "garch_residuals_fixed.png", width = 800, height = 600)
plot(garch_residuals_ts, main="Residuals from GARCH(1,1)", ylab="Standardized Residuals", xlab="Time")
dev.off()
# Step 6: ACF of GARCH residuals
png(filename = "garch_acf_residuals_fixed.png", width = 800, height = 600)
acf(garch_residuals_ts, main="ACF of GARCH(1,1) Residuals")
dev.off()
''')
# Display the residuals plot
display_r_plot("garch_residuals_fixed.png")
# Display the ACF plot
display_r_plot("garch_acf_residuals_fixed.png")
ro.r('''
# Step 1: Define GARCH(1,1) model with ARMA(1,1) mean model specification
garch_spec_arma <- ugarchspec(
variance.model = list(model = "sGARCH", garchOrder = c(1, 1)),
mean.model = list(armaOrder = c(1, 1), include.mean = TRUE), # ARMA(1,1) + GARCH(1,1)
distribution.model = "norm"
)
# Step 2: Fit ARMA(1,1) + GARCH(1,1) model using the differenced data, so it is technically ARIMA(1,1,1)
garch_fit_arma <- ugarchfit(spec = garch_spec_arma, data = differenced_close_train)
# Step 3: Print the summary of the ARMA(1,1) + GARCH(1,1) model
print(garch_fit_arma)
''')
ro.r('''
# Step 4: Extract residuals for diagnostics
arma_garch_residuals <- residuals(garch_fit_arma, standardize = TRUE)
# Convert residuals to a time series object (adjust dates as needed)
arma_garch_residuals_ts <- ts(arma_garch_residuals, start = c(2009, 1), frequency = 252) # Adjust the start date if necessary
# Step 5: Plot ARMA(1,1) + GARCH(1,1) residuals
png(filename = "arma_garch_residuals.png", width = 800, height = 600)
plot(arma_garch_residuals_ts, main="Residuals from ARMA(1,1) + GARCH(1,1)", ylab="Standardized Residuals", xlab="Time")
dev.off()
''')
display_r_plot('arma_garch_residuals.png')
ro.r('''
# Step 6: ACF of ARMA(1,1) + GARCH(1,1) residuals
png(filename = "arma_garch_acf_residuals.png", width = 800, height = 600)
acf(arma_garch_residuals_ts, main="ACF of ARMA(1,1) + GARCH(1,1) Residuals")
dev.off()
''')
display_r_plot('arma_garch_acf_residuals.png')
*---------------------------------*
* GARCH Model Fit *
*---------------------------------*
Conditional Variance Dynamics
-----------------------------------
GARCH Model : sGARCH(1,1)
Mean Model : ARFIMA(0,0,0)
Distribution : norm
Optimal Parameters
------------------------------------
Estimate Std. Error t value Pr(>|t|)
omega 0.000019 0.000003 5.8135 0
alpha1 0.111942 0.016161 6.9266 0
beta1 0.828989 0.021810 38.0089 0
Robust Standard Errors:
Estimate Std. Error t value Pr(>|t|)
omega 0.000019 0.000005 3.9099 9.2e-05
alpha1 0.111942 0.024090 4.6469 3.0e-06
beta1 0.828989 0.030289 27.3694 0.0e+00
LogLikelihood : 7439.001
Information Criteria
------------------------------------
Akaike -5.3709
Bayes -5.3645
Shibata -5.3709
Hannan-Quinn -5.3686
Weighted Ljung-Box Test on Standardized Residuals
------------------------------------
statistic p-value
Lag[1] 0.5466 0.4597
Lag[2*(p+q)+(p+q)-1][2] 0.5565 0.6677
Lag[4*(p+q)+(p+q)-1][5] 1.8415 0.6563
d.o.f=0
H0 : No serial correlation
Weighted Ljung-Box Test on Standardized Squared Residuals
------------------------------------
statistic p-value
Lag[1] 0.2612 0.6093
Lag[2*(p+q)+(p+q)-1][5] 1.2454 0.8021
Lag[4*(p+q)+(p+q)-1][9] 1.8775 0.9191
d.o.f=2
Weighted ARCH LM Tests
------------------------------------
Statistic Shape Scale P-Value
ARCH Lag[3] 0.2657 0.500 2.000 0.6063
ARCH Lag[5] 1.2770 1.440 1.667 0.6529
ARCH Lag[7] 1.3641 2.315 1.543 0.8482
Nyblom stability test
------------------------------------
Joint Statistic: 0.5147
Individual Statistics:
omega 0.1124
alpha1 0.2077
beta1 0.1487
Asymptotic Critical Values (10% 5% 1%)
Joint Statistic: 0.846 1.01 1.35
Individual Statistic: 0.35 0.47 0.75
Sign Bias Test
------------------------------------
t-value prob sig
Sign Bias 0.8015 0.42293
Negative Sign Bias 1.5746 0.11547
Positive Sign Bias 0.9429 0.34581
Joint Effect 11.2639 0.01038 **
Adjusted Pearson Goodness-of-Fit Test:
------------------------------------
group statistic p-value(g-1)
1 20 151.9 9.227e-23
2 30 166.1 3.637e-21
3 40 171.7 1.439e-18
4 50 181.8 3.777e-17
Elapsed time : 0.129421
*---------------------------------*
* GARCH Model Fit *
*---------------------------------*
Conditional Variance Dynamics
-----------------------------------
GARCH Model : sGARCH(1,1)
Mean Model : ARFIMA(1,0,1)
Distribution : norm
Optimal Parameters
------------------------------------
Estimate Std. Error t value Pr(>|t|)
mu 0.001796 0.000286 6.2786 0.000000
ar1 -0.666769 0.344172 -1.9373 0.052707
ma1 0.659638 0.346961 1.9012 0.057277
omega 0.000019 0.000003 6.1968 0.000000
alpha1 0.126861 0.017748 7.1480 0.000000
beta1 0.816127 0.021689 37.6292 0.000000
Robust Standard Errors:
Estimate Std. Error t value Pr(>|t|)
mu 0.001796 0.000313 5.7377 0.0e+00
ar1 -0.666769 0.102526 -6.5034 0.0e+00
ma1 0.659638 0.102530 6.4336 0.0e+00
omega 0.000019 0.000005 4.1221 3.8e-05
alpha1 0.126861 0.025731 4.9302 1.0e-06
beta1 0.816127 0.030438 26.8129 0.0e+00
LogLikelihood : 7458.195
Information Criteria
------------------------------------
Akaike -5.3826
Bayes -5.3697
Shibata -5.3826
Hannan-Quinn -5.3780
Weighted Ljung-Box Test on Standardized Residuals
------------------------------------
statistic p-value
Lag[1] 1.396 0.2373
Lag[2*(p+q)+(p+q)-1][5] 2.417 0.8206
Lag[4*(p+q)+(p+q)-1][9] 5.423 0.3625
d.o.f=2
H0 : No serial correlation
Weighted Ljung-Box Test on Standardized Squared Residuals
------------------------------------
statistic p-value
Lag[1] 0.2591 0.6108
Lag[2*(p+q)+(p+q)-1][5] 1.6148 0.7116
Lag[4*(p+q)+(p+q)-1][9] 2.4440 0.8459
d.o.f=2
Weighted ARCH LM Tests
------------------------------------
Statistic Shape Scale P-Value
ARCH Lag[3] 0.4541 0.500 2.000 0.5004
ARCH Lag[5] 1.8084 1.440 1.667 0.5151
ARCH Lag[7] 1.9533 2.315 1.543 0.7272
Nyblom stability test
------------------------------------
Joint Statistic: 1.2056
Individual Statistics:
mu 0.17028
ar1 0.04929
ma1 0.05024
omega 0.10574
alpha1 0.17120
beta1 0.13403
Asymptotic Critical Values (10% 5% 1%)
Joint Statistic: 1.49 1.68 2.12
Individual Statistic: 0.35 0.47 0.75
Sign Bias Test
------------------------------------
t-value prob sig
Sign Bias 0.4887 0.62513
Negative Sign Bias 1.2802 0.20059
Positive Sign Bias 1.1558 0.24786
Joint Effect 8.4878 0.03694 **
Adjusted Pearson Goodness-of-Fit Test:
------------------------------------
group statistic p-value(g-1)
1 20 139.6 2.154e-20
2 30 150.3 2.614e-18
3 40 164.7 2.224e-17
4 50 175.8 3.481e-16
Elapsed time : 0.426626
ro.r(''' # Autocorrelation significance test
# Perform Ljung-Box test to check for autocorrelation in GARCH(1,1) residuals
ljung_box_test_garch <- Box.test(garch_residuals, lag = 10, type = "Ljung-Box")
print(ljung_box_test_garch)
''')
ro.r('''
# Perform Ljung-Box test to check for autocorrelation in ARMA(1,1) + GARCH(1,1) residuals
ljung_box_test_arma_garch <- Box.test(arma_garch_residuals, lag = 10, type = "Ljung-Box")
print(ljung_box_test_arma_garch)
''')
Box-Ljung test Box-Ljung test data: garch_residuals X-squared = 14.705, df = 10, p-value = 0.1432 Box-Ljung test data: arma_garch_residuals X-squared = 14.738, df = 10, p-value = 0.1419
ro.r(''' # Normal vs std distribution
# Model 1: GARCH(1,1) + ARMA(1,1) with normal distribution
garch_spec_normal <- ugarchspec(
variance.model = list(model = "sGARCH", garchOrder = c(1, 1)),
mean.model = list(armaOrder = c(1, 1), include.mean = TRUE), # ARMA(1,1)
distribution.model = "norm"
)
garch_fit_normal <- ugarchfit(spec = garch_spec_normal, data = differenced_close_train)
aic_normal <- infocriteria(garch_fit_normal)["Akaike",]
bic_normal <- infocriteria(garch_fit_normal)["Bayes",]
print(paste("AIC (Normal):", aic_normal))
print(paste("BIC (Normal):", bic_normal))
# Model 2: GARCH(1,1) + ARMA(1,1) with Student-t distribution
garch_spec_student <- ugarchspec(
variance.model = list(model = "sGARCH", garchOrder = c(1, 1)),
mean.model = list(armaOrder = c(1, 1), include.mean = TRUE), # ARMA(1,1)
distribution.model = "std" # Student-t distribution
)
garch_fit_student <- ugarchfit(spec = garch_spec_student, data = differenced_close_train)
aic_student <- infocriteria(garch_fit_student)["Akaike",]
bic_student <- infocriteria(garch_fit_student)["Bayes",]
print(paste("AIC (Student-t):", aic_student))
print(paste("BIC (Student-t):", bic_student))
# Model 3: GARCH(1,1) with no mean model (Normal distribution)
garch_spec_no_mean_normal <- ugarchspec(
variance.model = list(model = "sGARCH", garchOrder = c(1, 1)),
mean.model = list(armaOrder = c(0, 0), include.mean = FALSE), # No mean model
distribution.model = "norm"
)
garch_fit_no_mean_normal <- ugarchfit(spec = garch_spec_no_mean_normal, data = differenced_close_train)
aic_no_mean_normal <- infocriteria(garch_fit_no_mean_normal)["Akaike",]
bic_no_mean_normal <- infocriteria(garch_fit_no_mean_normal)["Bayes",]
print(paste("AIC (No Mean, Normal):", aic_no_mean_normal))
print(paste("BIC (No Mean, Normal):", bic_no_mean_normal))
# Model 4: GARCH(1,1) with no mean model (Student-t distribution)
garch_spec_no_mean_student <- ugarchspec(
variance.model = list(model = "sGARCH", garchOrder = c(1, 1)),
mean.model = list(armaOrder = c(0, 0), include.mean = FALSE), # No mean model
distribution.model = "std"
)
garch_fit_no_mean_student <- ugarchfit(spec = garch_spec_no_mean_student, data = differenced_close_train)
aic_no_mean_student <- infocriteria(garch_fit_no_mean_student)["Akaike",]
bic_no_mean_student <- infocriteria(garch_fit_no_mean_student)["Bayes",]
print(paste("AIC (No Mean, Student-t):", aic_no_mean_student))
print(paste("BIC (No Mean, Student-t):", bic_no_mean_student))
# Model 5: EGARCH(1,1) with Normal distribution
egarch_spec_normal <- ugarchspec(
variance.model = list(model = "eGARCH", garchOrder = c(1, 1)),
mean.model = list(armaOrder = c(1, 1), include.mean = TRUE), # ARMA(1,1)
distribution.model = "norm"
)
egarch_fit_normal <- ugarchfit(spec = egarch_spec_normal, data = differenced_close_train)
aic_egarch_normal <- infocriteria(egarch_fit_normal)["Akaike",]
bic_egarch_normal <- infocriteria(egarch_fit_normal)["Bayes",]
print(paste("AIC (EGARCH Normal):", aic_egarch_normal))
print(paste("BIC (EGARCH Normal):", bic_egarch_normal))
# Model 6: EGARCH(1,1) with Student-t distribution
egarch_spec_student <- ugarchspec(
variance.model = list(model = "eGARCH", garchOrder = c(1, 1)),
mean.model = list(armaOrder = c(1, 1), include.mean = TRUE), # ARMA(1,1)
distribution.model = "std"
)
egarch_fit_student <- ugarchfit(spec = egarch_spec_student, data = differenced_close_train)
aic_egarch_student <- infocriteria(egarch_fit_student)["Akaike",]
bic_egarch_student <- infocriteria(egarch_fit_student)["Bayes",]
print(paste("AIC (EGARCH Student-t):", aic_egarch_student))
print(paste("BIC (EGARCH Student-t):", bic_egarch_student))
''')
[1] "AIC (Normal): -5.38258925834721" [1] "BIC (Normal): -5.36974799827534" "AIC (Normal): -5.38258925834721" [1] "BIC (Normal): -5.36974799827534" [1] "AIC (Student-t): -5.49079071140935" [1] "BIC (Student-t): -5.4758092413255" [1] "AIC (No Mean, Normal): -5.37089275542401" [1] "BIC (No Mean, Normal): -5.36447212538807" [1] "AIC (No Mean, Student-t): -5.47922901823593" [1] "BIC (No Mean, Student-t): -5.47066817818801" [1] "AIC (EGARCH Normal): -5.41517447172052" [1] "BIC (EGARCH Normal): -5.40019300163666" [1] "AIC (EGARCH Student-t): -5.51378388352096" [1] "BIC (EGARCH Student-t): -5.49666220342513"
ro.r(''' # Extract residuals for each model
garch_residuals_normal <- residuals(garch_fit_normal, standardize = TRUE)
garch_residuals_student <- residuals(garch_fit_student, standardize = TRUE)
garch_residuals_no_mean_student <- residuals(garch_fit_no_mean_student, standardize = TRUE)
garch_residuals_no_mean_normal <- residuals(garch_fit_no_mean_normal, standardize = TRUE)
garch_residuals_student <- residuals(garch_fit_student, standardize = TRUE)
egarch_residuals_normal <- residuals(egarch_fit_normal, standardize = TRUE)
egarch_residuals_student <- residuals(egarch_fit_student, standardize = TRUE)
# Perform Ljung-Box test for all models
ljung_box_garch_normal <- Box.test(garch_residuals_normal, lag = 10, type = "Ljung-Box")
print(ljung_box_garch_normal)
ljung_box_garch_student <- Box.test(garch_residuals_student, lag = 10, type = "Ljung-Box")
print(ljung_box_garch_student)
ljung_box_garch_normal <- Box.test(garch_residuals_no_mean_normal, lag = 10, type = "Ljung-Box")
print(ljung_box_garch_normal)
ljung_box_garch_student <- Box.test(garch_residuals_no_mean_student, lag = 10, type = "Ljung-Box")
print(ljung_box_garch_student)
ljung_box_egarch_normal <- Box.test(egarch_residuals_normal, lag = 10, type = "Ljung-Box")
print(ljung_box_egarch_normal)
ljung_box_egarch_student <- Box.test(egarch_residuals_student, lag = 10, type = "Ljung-Box")
print(ljung_box_egarch_student)
''')
Box-Ljung test data: garch_residuals_normal X-squared = 14.738, df = 10, p-value = 0.1419 Box-Ljung test data: garch_residuals_student X-squared = 14.601, df = 10, p-value = 0.1473 Box-Ljung test data: garch_residuals_no_mean_normal X-squared = 14.705, df = 10, p-value = 0.1432 Box-Ljung test data: garch_residuals_no_mean_student X-squared = 14.482, df = 10, p-value = 0.1521 Box-Ljung test data: egarch_residuals_normal X-squared = 15.028, df = 10, p-value = 0.131 Box-Ljung test data: egarch_residuals_student X-squared = 14.359, df = 10, p-value = 0.1572
After conducting ARIMA and ARIMAX modeling, I found that volatility clustering remained a core issue that those models could not address. Thus, I moved forward with GARCH and EGARCH models to better handle time-varying volatility in the data.
To fully capture both mean and volatility, I tested several GARCH and EGARCH variants. Some models incorporated an ARMA(1,1) mean model, while others did not.
Here are the six models tested:
| Model | AIC | BIC |
|---|---|---|
| GARCH(1,1) with Normal | -5.3826 | -5.3697 |
| GARCH(1,1) with Student-t | -5.4908 | -5.4758 |
| GARCH(1,1) No Mean, Normal | -5.3709 | -5.3645 |
| GARCH(1,1) No Mean, Student-t | -5.4792 | -5.4707 |
| EGARCH(1,1) with Normal | -5.4152 | -5.4002 |
| EGARCH(1,1) with Student-t | -5.5138 | -5.4967 |
| Model | Box-Ljung Test (p-value) |
|---|---|
| GARCH(1,1) with Normal | 0.1418 |
| GARCH(1,1) with Student-t | 0.1473 |
| GARCH(1,1) No Mean, Normal | 0.1432 |
| GARCH(1,1) No Mean, Student-t | 0.1522 |
| EGARCH(1,1) with Normal | 0.131 |
| EGARCH(1,1) with Student-t | 0.1572 |
EGARCH (Exponential GARCH): This model can capture asymmetric effects, where negative shocks tend to increase volatility more than positive shocks.
Student-t Distribution: Its heavy tails better capture large movements in stock prices than the normal distribution.
ARIMA(1,1,1) + EGARCH(1,1): By incorporating the ARIMA(1,1,1) model, I addressed the autocorrelation and stationarity in the mean, while EGARCH(1,1) captured the time-varying volatility.
Conclusion: The EGARCH(1,1) with Student-t distribution model provides the best fit based on statistical criteria (AIC, BIC) and residual diagnostics, effectively handling both the mean structure and volatility clustering in the AAPL data.
Having identified the EGARCH(1,1) with Student-t distribution model as the best-performing time-series model, I am now incorporating its volatility predictions into the regression framework. The goal is to enhance the performance of both robust and regularized regression methods by accounting for the time-varying volatility captured by EGARCH. This step will allow the model to handle not just the mean structure, but also the heteroscedasticity inherent in the data, improving predictive accuracy.
ro.r(''' # Integrate EGARCH into regression models
set.seed(123)
# Step 1: Generate Volatility Predictions from EGARCH(1,1) with Student-t distribution
egarch_volatility <- sigma(egarch_fit_student) # Extract the volatility from EGARCH
# Add the volatility predictions as a new column to the training dataset
transformed_df$egarch_volatility <- egarch_volatility
# Step 2: Update Robust Regression Models with EGARCH Volatility
# Fit robust regression model
robust_model_egarch <- rlm(log_Close ~ . + egarch_volatility, data = transformed_df)
# Predict on the test data with added volatility feature
transformed_test_df$egarch_volatility <- sigma(egarch_fit_student)[1:nrow(transformed_test_df)]
robust_preds_egarch <- predict(robust_model_egarch, newdata = transformed_test_df)
# Calculate MSPE for robust model with EGARCH volatility
robust_mspe_egarch <- mean((test_y - robust_preds_egarch)^2)
cat("EGARCH-Enhanced Huber Regression MSPE:", robust_mspe_egarch, "\n")
# Step 3: Update QR and LTS Models with EGARCH Volatility
# Fit Quantile Regression (QR) model with EGARCH volatility at tau = 0.5 (median)
qr_model_egarch <- rq(log_Close ~ . + egarch_volatility, data = transformed_df, tau = 0.5)
# Predict on the test data for QR model
qr_preds_egarch <- predict(qr_model_egarch, newdata = transformed_test_df)
# Calculate MSPE for QR model with EGARCH volatility
qr_mspe_egarch <- mean((test_y - qr_preds_egarch)^2)
cat("EGARCH-Enhanced Quantile Regression MSPE:", qr_mspe_egarch, "\n")
# Fit Least Trimmed Squares (LTS) model with EGARCH volatility
lts_model_egarch <- lqs(log_Close ~ . + egarch_volatility, data = transformed_df)
# Predict on the test data for LTS model
lts_preds_egarch <- predict(lts_model_egarch, newdata = transformed_test_df)
# Calculate MSPE for LTS model with EGARCH volatility
lts_mspe_egarch <- mean((test_y - lts_preds_egarch)^2)
cat("EGARCH-Enhanced Least Trimmed Squares Regression MSPE:", lts_mspe_egarch, "\n")
# Step 3: Update Regularized Regression Models with EGARCH Volatility
# Convert predictors and response using model.matrix including EGARCH volatility
x_egarch <- model.matrix(log_Close ~ . + egarch_volatility, transformed_df)[, -1] # Exclude intercept
y <- transformed_df$log_Close # Response variable
# Fit Ridge, Lasso, and ElasticNet models with EGARCH volatility
ridge_model_egarch <- cv.glmnet(x_egarch, y, alpha = 0)
lasso_model_egarch <- cv.glmnet(x_egarch, y, alpha = 1)
elasticnet_model_egarch <- cv.glmnet(x_egarch, y, alpha = 0.5)
# Predict on test data
x_test_egarch <- model.matrix(log_Close ~ . + egarch_volatility, transformed_test_df)[, -1] # Exclude intercept
ridge_preds_egarch <- predict(ridge_model_egarch, newx = x_test_egarch)
lasso_preds_egarch <- predict(lasso_model_egarch, newx = x_test_egarch)
elasticnet_preds_egarch <- predict(elasticnet_model_egarch, newx = x_test_egarch)
# Calculate MSPE for each regularized regression model with EGARCH volatility
ridge_mspe_egarch <- mean((test_y - ridge_preds_egarch)^2)
lasso_mspe_egarch <- mean((test_y - lasso_preds_egarch)^2)
elasticnet_mspe_egarch <- mean((test_y - elasticnet_preds_egarch)^2)
cat("EGARCH-Enhanced Ridge Regression MSPE:", ridge_mspe_egarch, "\n")
cat("EGARCH-Enhanced Lasso Regression MSPE:", lasso_mspe_egarch, "\n")
cat("EGARCH-Enhanced ElasticNet Regression MSPE:", elasticnet_mspe_egarch, "\n")
# Step 4: Update VIF Model and Full Model to include EGARCH Volatility
vif_model_egarch <- lm(log_Close ~ log_Volume + cube_volume_adi + cube_volume_obv + volume_vpt + cube_momentum_ao + egarch_volatility, data = transformed_df)
full_model_egarch <- lm(log_Close ~ . + egarch_volatility, data = transformed_df)
transformed_test_df$egarch_volatility <- sigma(egarch_fit_student)[1:nrow(transformed_test_df)]
vif_preds_egarch <- predict(vif_model_egarch, newdata = transformed_test_df)
vif_mspe_egarch <- mean((test_y - vif_preds_egarch)^2)
full_preds_egarch <- predict(full_model_egarch, newdata = transformed_test_df)
full_mspe_egarch <- mean((test_y - full_preds_egarch)^2)
cat("EGARCH-Enhanced VIF Model MSPE:", vif_mspe_egarch, "\n")
cat("EGARCH-Enhanced Full Model MSPE:", full_mspe_egarch, "\n")
''')
EGARCH-Enhanced Huber Regression MSPE: 4.950549e-05 4.950549e-05 EGARCH-Enhanced Quantile Regression MSPE: 5.079661e-05 EGARCH-Enhanced Least Trimmed Squares Regression MSPE: 0.0001883157 EGARCH-Enhanced Ridge Regression MSPE: 0.0006812467 EGARCH-Enhanced Lasso Regression MSPE: 0.002135133 EGARCH-Enhanced ElasticNet Regression MSPE: 0.0008295045 EGARCH-Enhanced VIF Model MSPE: 0.1123627 EGARCH-Enhanced Full Model MSPE: 6.050708e-05
ro.r(''' # Helper function to apply smearing (bias correction) and calculate MSPE in original scale
calculate_mspe_with_smearing <- function(log_preds, log_true) {
# Exponentiate the predictions and true values to get them on the original scale
pred_original <- exp(log_preds)
true_original <- exp(log_true)
# Calculate residuals on the log scale
residuals_log <- log_true - log_preds
# Bias correction using the smearing estimator
CF <- mean(exp(residuals_log)) # Correction factor for bias
adjusted_preds_original <- pred_original * CF
# Calculate MSPE on the original scale
mspe_original <- mean((true_original - adjusted_preds_original)^2)
return(mspe_original)
}
# Step 4: Apply smearing for all EGARCH-enhanced models
# Huber Regression with EGARCH Volatility (Original Scale)
robust_mspe_egarch_original <- calculate_mspe_with_smearing(robust_preds_egarch, test_y)
cat("EGARCH-Enhanced Huber Regression MSPE (Original Scale):", robust_mspe_egarch_original, "\n")
# Quantile Regression (QR) with EGARCH Volatility (Original Scale)
qr_mspe_egarch_original <- calculate_mspe_with_smearing(qr_preds_egarch, test_y)
cat("EGARCH-Enhanced Quantile Regression MSPE (Original Scale):", qr_mspe_egarch_original, "\n")
# Least Trimmed Squares (LTS) with EGARCH Volatility (Original Scale)
lts_mspe_egarch_original <- calculate_mspe_with_smearing(lts_preds_egarch, test_y)
cat("EGARCH-Enhanced Least Trimmed Squares Regression MSPE (Original Scale):", lts_mspe_egarch_original, "\n")
# Ridge Regression with EGARCH Volatility (Original Scale)
ridge_mspe_egarch_original <- calculate_mspe_with_smearing(ridge_preds_egarch, test_y)
cat("EGARCH-Enhanced Ridge Regression MSPE (Original Scale):", ridge_mspe_egarch_original, "\n")
# Lasso Regression with EGARCH Volatility (Original Scale)
lasso_mspe_egarch_original <- calculate_mspe_with_smearing(lasso_preds_egarch, test_y)
cat("EGARCH-Enhanced Lasso Regression MSPE (Original Scale):", lasso_mspe_egarch_original, "\n")
# ElasticNet Regression with EGARCH Volatility (Original Scale)
elasticnet_mspe_egarch_original <- calculate_mspe_with_smearing(elasticnet_preds_egarch, test_y)
cat("EGARCH-Enhanced ElasticNet Regression MSPE (Original Scale):", elasticnet_mspe_egarch_original, "\n")
vif_mspe_egarch_original <- calculate_mspe_with_smearing(vif_preds_egarch, test_y)
cat("VIF Model with EGARCH Volatility and Smearing MSPE (Original Scale):", vif_mspe_egarch_original, "\n")
# Step 6: Apply smearing bias correction for Full Model with EGARCH Volatility
full_model_mspe_egarch_original <- calculate_mspe_with_smearing(full_preds_egarch, test_y)
cat("Full Model with EGARCH Volatility and Smearing MSPE (Original Scale):", full_model_mspe_egarch_original, "\n")
''')
EGARCH-Enhanced Huber Regression MSPE (Original Scale): 1.097216 EGARCH-Enhanced Quantile Regression MSPE (Original Scale): 1.090426 EGARCH-Enhanced Least Trimmed Squares Regression MSPE (Original Scale): 4.457868 EGARCH-Enhanced Ridge Regression MSPE (Original Scale): 15.32344 EGARCH-Enhanced Lasso Regression MSPE (Original Scale): 4.930664 EGARCH-Enhanced ElasticNet Regression MSPE (Original Scale): 8.258932 VIF Model with EGARCH Volatility and Smearing MSPE (Original Scale): 4630.835 Full Model with EGARCH Volatility and Smearing MSPE (Original Scale): 1.335404
This analysis applied robust regression methods (Quantile Regression, Huber Regression, and Least Trimmed Squares), regularized regression methods (Ridge, Lasso, and Elastic Net), and EGARCH-enhanced versions of these models. The objective was to assess whether incorporating EGARCH volatility into the models would improve predictive performance by addressing volatility and outliers in financial data.
| Model | MSPE (Log Scale) | MSPE (Original Scale) |
|---|---|---|
| EGARCH-Enhanced Huber Regression | 4.948813e-05 | 1.097161 |
| EGARCH-Enhanced Quantile Regression | 5.062403e-05 | 1.083868 |
| EGARCH-Enhanced Least Trimmed Squares Regression | 0.0001883546 | 4.45806 |
| EGARCH-Enhanced Ridge Regression | 0.000681377 | 15.3242 |
| EGARCH-Enhanced Lasso Regression | 0.002135133 | 4.930664 |
| EGARCH-Enhanced ElasticNet Regression | 0.0008286128 | 8.247073 |
| EGARCH-Enhanced VIF Model | 0.1123632 | 4630.846 |
| EGARCH-Enhanced Full Model | 6.048988e-05 | 1.335532 |
| Model | MSPE (Log Scale) | MSPE (Original Scale) |
|---|---|---|
| Quantile Regression | 3.6021e-05 | 0.8046161 |
| Huber Regression | 3.789034e-05 | 0.8522115 |
| Penalized Quantile Regression | 7.939872e-05 | 1.758258 |
| Least Trimmed Squares Regression | 0.0007393806 | 15.15739 |
| Model | MSPE (Log Scale) | MSPE (Original Scale) |
|---|---|---|
| Ridge Regression | 0.0006943541 | 16.37199 |
| Lasso Regression | 0.002135133 | 4.930664 |
| Elastic Net Regression | 0.002135133 | 4.930664 |
| VIF Model | 0.08338693 | 3076.539 |
| Full Model | 4.225889e-05 | 0.9543641 |
Increased MSPE Across Most Models: The EGARCH-enhanced models generally performed worse than their original versions, with higher MSPE on both the log and original scales. This can be attributed to the introduction of volatility features that added complexity to the models, which were not well-suited to handle the nonlinear nature of time-varying volatility.
Robust Methods with EGARCH: Both Quantile Regression and Huber Regression showed minimal increases in MSPE after incorporating EGARCH volatility. The results indicate that these models, which are inherently robust to outliers, were not significantly affected by volatility adjustments. However, their predictive accuracy did not substantially improve either, suggesting that the EGARCH volatility did not provide substantial new information.
Improvement in Least Trimmed Squares (LTS): Interestingly, Least Trimmed Squares Regression showed improvement after introducing EGARCH volatility, reducing its MSPE from 15.15739 (Original Scale) to 4.45806. This improvement suggests that LTS, which minimizes the influence of extreme values, benefited from the EGARCH volatility's ability to account for large swings in the data, reducing the impact of outliers.
Ridge Regression with EGARCH: Ridge Regression showed a slight improvement after incorporating EGARCH, with the MSPE improving on the log scale from 0.0006943541 to 0.000681377. However, the improvement was not as significant on the original scale, indicating that while Ridge Regression handled multicollinearity effectively, it did not fully capitalize on the volatility captured by EGARCH.
Lasso Regression: Lasso Regression remained stable with no significant change after the EGARCH adjustment, maintaining an MSPE of 4.930664 on both scales. This shows that Lasso was largely unaffected by the volatility adjustment, as the sparsity enforced by the L1 penalty helped retain its predictive performance.
Elastic Net Regression Got Worse: The Elastic Net Regression model's performance deteriorated with EGARCH volatility, seeing an increase in MSPE from 4.930664 to 8.247073. This suggests that the combination of L1 and L2 penalties could not balance the added complexity from the volatility, leading to a degradation in prediction accuracy.
VIF Model and Full Model with EGARCH: The VIF Model performed poorly even with EGARCH volatility, with an extremely high MSPE of 4630.846 (Original Scale). This likely resulted from the model’s inability to handle the complexity introduced by both multicollinearity and volatility, leading to poor predictions. The Full Model, though not as poor, also showed an increase in MSPE from 0.9543641 (Original Scale) to 1.335532, indicating a deterioration in performance when EGARCH volatility was introduced.
Nonlinear Complexity: EGARCH models are specifically designed to capture time-varying volatility, which is a non-linear characteristic of financial data. However, the regularized regression models (Ridge, Lasso, Elastic Net) are linear models, which means they struggle to incorporate and benefit from the volatility features introduced by EGARCH. This disconnect between the linear nature of the models and the nonlinear characteristics of the data likely led to increased MSPE.
Smearing Bias Correction: The application of smearing bias correction further exposed the limitations of these models in handling volatility. The correction for bias due to log transformation revealed that the models were consistently underperforming when predicting the original (untransformed) values of stock prices.
Impact of EGARCH on Robust Methods: While robust methods like Huber and Quantile Regression are designed to minimize the influence of extreme values, the introduction of EGARCH volatility did not provide significant new information to enhance their performance. The nature of these models already focuses on handling irregularities in the data, and thus the volatility adjustment did not substantially improve their predictions.
Improvement in LTS: The significant improvement in Least Trimmed Squares Regression after incorporating EGARCH volatility suggests that volatility features helped reduce the model’s sensitivity to extreme outliers, making it more stable and reliable in handling the financial data's inherent volatility.
EGARCH Volatility Did Not Enhance Regularized Regression: Despite the complexity introduced by EGARCH volatility, the regularized methods (Lasso, Elastic Net, and Ridge) did not benefit substantially, with only marginal improvements in Ridge Regression. The primary reason lies in the disconnect between the non-linear nature of volatility and the linear structure of regularized regression models.
Robust Methods Remain Strong Contenders: Quantile Regression and Huber Regression maintained their strength even after introducing EGARCH volatility, but they did not show significant improvement. Least Trimmed Squares (LTS) saw the most notable improvement after EGARCH was introduced, highlighting its potential for handling volatility-driven data.
Practical Implications: The results suggest that while EGARCH is highly effective for volatility modeling, its integration into linear regression models may not yield substantial benefits unless the model structure itself is adapted to handle non-linearities. Financial models that directly address volatility through methods such as LTS or Ridge Regression are more suited to capture the characteristics of the data when combined with volatility features.
The analysis demonstrated that the integration of EGARCH volatility into the regression framework did not yield significant improvements in most regularized methods. Despite the ability of the EGARCH model to capture volatility clustering, the linear nature of Ridge, Lasso, and Elastic Net models seems to have struggled to leverage the non-linear volatility insights provided by the EGARCH model.
However, there were notable improvements in Ridge and Least Trimmed Squares models, showcasing that EGARCH-enhanced volatility can enhance models that are more sensitive to outliers or multicollinearity. This insight suggests that future work should focus on models capable of handling non-linear relationships more effectively.
While the EGARCH-enhanced models did not universally improve performance across the board, they provided valuable insights into the complex volatility structures inherent in financial data. The Ridge and LTS models benefited from the volatility features, while Elastic Net and VIF models struggled to adapt. This mixed performance suggests that future work should focus on:
Exploring non-linear and machine learning approaches: Methods like Random Forests, GBM, and Neural Networks can capture complex interactions and non-linearities better than linear regression models.
Ensemble methods: Combining the strengths of robust and machine learning models through stacking or blending will provide a more comprehensive predictive framework.
Backtesting and validation: Backtesting using rolling-window and walk-forward validation is crucial to ensuring robustness in live trading scenarios.
Risk quantification: Implementing Monte Carlo simulations will enhance risk management and provide deeper insights into the range of possible outcomes under varying market conditions.
The progress made so far has laid a strong foundation for further research into creating a highly effective quantitative trading strategy. By integrating these advanced methods and focusing on non-linearity, the next stage of this project will refine and enhance the overall predictive capability.